/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/hash.h>
#include <linux/bootmem.h>
#include <linux/debug_locks.h>

/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */
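
/*
 * Illustrative note (not part of this file's API): an architecture backend
 * typically implements these hooks with a blocking primitive plus a wakeup
 * hypercall. On x86/KVM, for example, pv_wait() re-checks *ptr with
 * interrupts disabled and then halts the vCPU, while pv_kick() issues a
 * KVM_HC_KICK_CPU hypercall targeting the given CPU.
 */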

#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
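
/*
 * With _Q_LOCKED_OFFSET being 0 (see qspinlock_types.h), _Q_SLOW_VAL is 3,
 * so the locked byte lets the unlock path tell a hashed ("slow") lock apart
 * from one that is simply held (_Q_LOCKED_VAL == 1).
 */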

/*
 * Queue Node Adaptive Spinning
 *
 * A queue node vCPU will stop spinning if the vCPU in the previous node is
 * not running. The one lock stealing attempt allowed at slowpath entry
 * mitigates the slight slowdown for a non-overcommitted guest with this
 * aggressive wait-early mechanism.
 *
 * The status of the previous node will be checked at fixed interval
 * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
 * pound on the cacheline of the previous node too heavily.
 */
#define PV_PREV_CHECK_MASK	0xff
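
/*
 * Example: with PV_PREV_CHECK_MASK == 0xff, pv_wait_early() only performs
 * the prev->state/vcpu_is_preempted() check when the loop count is a
 * multiple of 256, i.e. at most once every 256 iterations of the
 * SPIN_THRESHOLD spin loop in pv_wait_node().
 */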

/*
 * Queue node uses: vcpu_running & vcpu_halted.
 * Queue head uses: vcpu_running & vcpu_hashed.
 */
enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,		/* Used only in pv_wait_node */
	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
};
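
/*
 * State transitions, as implemented below:
 *
 *   vcpu_running -> vcpu_halted:	pv_wait_node() before calling pv_wait()
 *   vcpu_halted  -> vcpu_running:	pv_wait_node() after waking up
 *   vcpu_halted  -> vcpu_hashed:	pv_kick_node() on behalf of the waiter
 *   vcpu_running -> vcpu_hashed:	pv_wait_head_or_lock() before pv_wait()
 *   vcpu_hashed  -> vcpu_running:	pv_wait_head_or_lock() at the top of
 *					its wait loop
 */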

struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};

/*
 * Include queued spinlock statistics code
 */
#include "qspinlock_stat.h"

/*
 * By replacing the regular queued_spin_trylock() with the function below,
 * it will be called once when a lock waiter enters the PV slowpath before
 * being queued. By allowing one lock stealing attempt here when the pending
 * bit is off, it helps to reduce the performance impact of lock waiter
 * preemption without the drawback of lock starvation.
 */
#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
	    (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
		qstat_inc(qstat_pv_lock_stealing, true);
		return true;
	}

	return false;
}

/*
 * The pending bit is used by the queue head vCPU to indicate that it
 * is actively spinning on the lock and no lock stealing is allowed.
 */
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 1);
}

/*
 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
 * lock, to make sure that the queue head vCPU really does get the lock.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	return !READ_ONCE(lock->locked) &&
	       (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	atomic_or(_Q_PENDING_VAL, &lock->val);
}

static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	int val = atomic_read(&lock->val);

	for (;;) {
		int old, new;

		if (val & _Q_LOCKED_MASK)
			break;

		/*
		 * Try to clear pending bit & set locked bit
		 */
		old = val;
		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
		val = atomic_cmpxchg_acquire(&lock->val, old, new);

		if (val == old)
			return 1;
	}
	return 0;
}
#endif /* _Q_PENDING_BITS == 8 */

/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 *
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};

#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
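
/*
 * Example sizing, assuming 64-byte cache lines and 4k pages: a
 * pv_hash_entry is two pointers, i.e. 16 bytes on 64-bit, giving
 * PV_HE_PER_LINE = 4 and PV_HE_MIN = 256; with 64 possible CPUs,
 * 4 * 64 = 256 entries fill exactly one page. On 32-bit the entries are
 * 8 bytes each and PV_HE_MIN = 512, matching the numbers quoted above.
 */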

static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;

/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0,
					       HASH_EARLY | HASH_ZERO,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}

#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
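
/*
 * The probe sequence starts at the first entry of the cache line that the
 * hash value falls into and then walks the table linearly with wrap-around,
 * so in the common (lightly loaded) case a match is found without touching
 * a second cache line.
 */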

static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	int hopcnt = 0;

	for_each_hash_entry(he, offset, hash) {
		hopcnt++;
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			qstat_hop(hopcnt);
			return &he->lock;
		}
	}
	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4*nr_possible_cpus(), this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}

static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}
	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}

/*
 * Return true if it is time to check the previous node and that node is
 * not in a running state (or its vCPU has been preempted).
 */
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
	if ((loop & PV_PREV_CHECK_MASK) != 0)
		return false;

	return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}

/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}

/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in the hash table on its
 * behalf.
 */
static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct pv_node *pp = (struct pv_node *)prev;
	int loop;
	bool wait_early;

	for (;;) {
		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			if (pv_wait_early(pp, loop)) {
				wait_early = true;
				break;
			}
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
		 *     MB			      MB
		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
		 *
		 * Matches the cmpxchg() from pv_kick_node().
		 */
		smp_store_mb(pn->state, vcpu_halted);

		if (!READ_ONCE(node->locked)) {
			qstat_inc(qstat_pv_wait_node, true);
			qstat_inc(qstat_pv_wait_early, wait_early);
			pv_wait(&pn->state, vcpu_halted);
		}

		/*
		 * If pv_kick_node() changed us to vcpu_hashed, retain that
		 * value so that pv_wait_head_or_lock() knows to not also try
		 * to hash this lock.
		 */
		cmpxchg(&pn->state, vcpu_halted, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
	}

	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}

/*
 * Called after setting next->locked = 1 when we're the lock owner.
 *
 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 * so that they're waiting in pv_wait_head_or_lock(); this avoids a
 * wake/sleep cycle.
 */
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	/*
	 * If the vCPU is indeed halted, advance its state to match that of
	 * pv_wait_node(). If, on the other hand, this fails, the vCPU was
	 * running and will observe its next->locked value and advance itself.
	 *
	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
	 *
	 * The write to next->locked in arch_mcs_spin_unlock_contended()
	 * must be ordered before the read of pn->state in the cmpxchg()
	 * below for the code to work correctly. To guarantee full ordering
	 * irrespective of the success or failure of the cmpxchg(),
	 * a relaxed version with explicit barrier is used. The control
	 * dependency will order the reading of pn->state before any
	 * subsequent writes.
	 */
	smp_mb__before_atomic();
	if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
	    != vcpu_halted)
		return;

	/*
	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
	 *
	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
	 * the hash table later on at unlock time, no atomic instruction is
	 * needed.
	 */
	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
	(void)pv_hash(lock, pn);
}

/*
 * Wait for l->locked to become clear and acquire the lock;
 * halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 *
 * The current value of the lock will be returned for additional processing.
 */
static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct qspinlock **lp = NULL;
	int waitcnt = 0;
	int loop;

	/*
	 * If pv_kick_node() already advanced our state, we don't need to
	 * insert ourselves into the hash table anymore.
	 */
	if (READ_ONCE(pn->state) == vcpu_hashed)
		lp = (struct qspinlock **)1;

	/*
	 * Tracking # of slowpath locking operations
	 */
	qstat_inc(qstat_pv_lock_slowpath, true);

	for (;; waitcnt++) {
		/*
		 * Set correct vCPU state to be used by queue node wait-early
		 * mechanism.
		 */
		WRITE_ONCE(pn->state, vcpu_running);

		/*
		 * Set the pending bit in the active lock spinning loop to
		 * disable lock stealing before attempting to acquire the lock.
		 */
		set_pending(lock);
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (trylock_clear_pending(lock))
				goto gotlock;
			cpu_relax();
		}
		clear_pending(lock);

		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);

			/*
			 * We must hash before setting _Q_SLOW_VAL, such that
			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
			 * we'll be sure to be able to observe our hash entry.
			 *
			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
			 *       MB                           RMB
			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
				/*
				 * The lock was free and now we own the lock.
				 * Change the lock value back to _Q_LOCKED_VAL
				 * and unhash the table.
				 */
				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
				WRITE_ONCE(*lp, NULL);
				goto gotlock;
			}
		}
		WRITE_ONCE(pn->state, vcpu_hashed);
		qstat_inc(qstat_pv_wait_head, true);
		qstat_inc(qstat_pv_wait_again, waitcnt);
		pv_wait(&lock->locked, _Q_SLOW_VAL);

		/*
		 * Because of lock stealing, the queue head vCPU may not be
		 * able to acquire the lock before it has to wait again.
		 */
	}

	/*
	 * The cmpxchg() or xchg() call before coming here provides the
	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
	 * here is to indicate to the compiler that the value will always
	 * be nonzero to enable better code optimization.
	 */
gotlock:
	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}

/*
 * PV versions of the unlock fastpath and slowpath functions to be used
 * instead of queued_spin_unlock().
 */
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
	struct pv_node *node;

	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
		     (unsigned long)lock, atomic_read(&lock->val));
		return;
	}

	/*
	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
	 * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
	 */
	smp_rmb();

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&lock->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 * The other vCPU may not really be halted, but kicking an active
	 * vCPU is harmless other than the additional latency in completing
	 * the unlock.
	 */
	qstat_inc(qstat_pv_kick_unlock, true);
	pv_kick(node->cpu);
}

/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
 * function close to each other sharing consecutive instruction cachelines.
 * Alternatively, an architecture-specific version of __pv_queued_spin_unlock()
 * can be defined.
 */
#include <asm/qspinlock_paravirt.h>
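
/*
 * On 64-bit x86, for instance, that header supplies a hand-written assembly
 * implementation of __pv_queued_spin_unlock() (and defines the
 * __pv_queued_spin_unlock macro), in which case the generic C version below
 * is not compiled.
 */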

#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	u8 locked;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
	if (likely(locked == _Q_LOCKED_VAL))
		return;

	__pv_queued_spin_unlock_slowpath(lock, locked);
}
#endif /* __pv_queued_spin_unlock */