/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */
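
/*
 * A minimal usage sketch of the public API that this shared-pool
 * machinery backs (see <linux/workqueue.h>).  The my_* names are
 * hypothetical; the block is kept under #if 0 as an illustration only.
 */
#if 0
struct my_data {
	struct work_struct work;
	int payload;
};

static void my_work_fn(struct work_struct *work)
{
	struct my_data *d = container_of(work, struct my_data, work);

	/* runs later, in process context, on a shared worker pool */
	pr_info("payload=%d\n", d->payload);
}

static void my_submit(struct my_data *d)
{
	INIT_WORK(&d->work, my_work_fn);
	schedule_work(&d->work);	/* queues the work item on system_wq */
}
#endif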

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>

#include "workqueue_internal.h"

enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only in emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};
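
/*
 * As a worked example of the mayday timing above: with HZ=1000, HZ / 100
 * is 10 ticks (10ms), which satisfies the >= 2 check, so
 * MAYDAY_INITIAL_TIMEOUT is 10ms; with HZ=100, HZ / 100 is a single tick,
 * so the two-tick minimum kicks in and the first mayday call happens after
 * 20ms instead.  MAYDAY_INTERVAL is HZ / 10, i.e. 100ms regardless of HZ.
 */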

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: pool->attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      sched-RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;
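
/*
 * A minimal sketch of what the "L:" annotations above mean in practice:
 * such fields are only touched with pool->lock held.  The helper below is
 * hypothetical and kept under #if 0 as an illustration only.
 */
#if 0
static bool example_pool_has_work(struct worker_pool *pool)
{
	bool pending;

	spin_lock_irq(&pool->lock);		/* L: fields need pool->lock */
	pending = !list_empty(&pool->worklist);
	spin_unlock_irq(&pool->lock);

	return pending;
}
#endif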

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned to
 * (1 << WORK_STRUCT_FLAG_BITS).
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
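
/*
 * A minimal usage sketch, assuming my_work is an already initialized
 * work_struct: alloc_workqueue(), queue_work(), flush_workqueue() and
 * destroy_workqueue() are the public interface for creating and driving
 * one of these workqueues.  The my_* names are hypothetical; kept under
 * #if 0 as an illustration only.
 */
#if 0
static struct workqueue_struct *my_wq;

static int my_setup(void)
{
	/* unbound, with a rescuer so it can make progress under memory pressure */
	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!my_wq)
		return -ENOMEM;

	queue_work(my_wq, &my_work);
	return 0;
}

static void my_teardown(void)
{
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}
#endif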

static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;			/* can kworkers be created yet? */

static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
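
/*
 * Note: workqueue.c is built in, so the module parameters above are
 * normally given on the kernel command line, e.g.
 * "workqueue.disable_numa=1", "workqueue.power_efficient=1" or
 * "workqueue.debug_force_rr_cpu=1"; see
 * Documentation/admin-guide/kernel-parameters.txt.
 */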

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU or wq_pool_mutex should be held")

#define assert_rcu_or_wq_mutex(wq)					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex),			\
			 "sched RCU or wq->mutex should be held")

#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU, wq->mutex or wq_pool_mutex should be held")

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or sched RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with @pool->attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
		else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or sched RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\
		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
		else
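
/*
 * A minimal sketch of using the iterator above under sched-RCU, as its
 * comment requires.  The helper is hypothetical and kept under #if 0 as
 * an illustration only.
 */
#if 0
static int example_count_pwqs(struct workqueue_struct *wq)
{
	struct pool_workqueue *pwq;
	int n = 0;

	rcu_read_lock_sched();
	for_each_pwq(pwq, wq)
		n++;
	rcu_read_unlock_sched();

	return n;
}
#endif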

#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

static bool work_is_static_object(void *addr)
{
	struct work_struct *work = addr;

	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	destroy_timer_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
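
/*
 * A minimal sketch of the on-stack work pattern that the debugobjects
 * hooks above validate.  INIT_WORK_ONSTACK(), flush_work() and
 * destroy_work_on_stack() are the public interface; example_fn is a
 * hypothetical callback.  Kept under #if 0 as an illustration only.
 */
#if 0
static void example_run_on_stack(void (*example_fn)(struct work_struct *))
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, example_fn);
	schedule_work(&work);
	flush_work(&work);	/* must finish before the stack frame is gone */
	destroy_work_on_stack(&work);
}
#endif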

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	lockdep_assert_held(&wq_pool_mutex);

	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
			GFP_KERNEL);
	if (ret >= 0) {
		pool->id = ret;
		return 0;
	}
	return ret;
}

/**
 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
 * @wq: the target workqueue
 * @node: the node ID
 *
 * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
 * read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * Return: The unbound pool_workqueue for @node.
 */
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
						  int node)
{
	assert_rcu_or_wq_mutex_or_pool_mutex(wq);

	/*
	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
	 * delayed item is pending.  The plan is to keep CPU -> NODE
	 * mapping valid and stable across CPU on/offlines.  Once that
	 * happens, this workaround can be removed.
	 */
	if (unlikely(node == NUMA_NO_NODE))
		return wq->dfl_pwq;

	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's
 * data contain the pointer to the queued pwq.  Once execution starts, the
 * flag is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - i.e. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
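
/*
 * Concretely: struct pool_workqueue is __aligned(1 << WORK_STRUCT_FLAG_BITS),
 * so the low WORK_STRUCT_FLAG_BITS of a pwq pointer are always zero and can
 * carry the WORK_STRUCT_* flag bits; masking with WORK_STRUCT_WQ_DATA_MASK
 * recovers the pointer.  When no pwq is set, the high bits instead hold
 * (pool_id << WORK_OFFQ_POOL_SHIFT) plus the WORK_OFFQ_* flags.
 */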
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	WARN_ON_ONCE(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
{
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
	/*
	 * The following mb guarantees that previous clear of a PENDING bit
	 * will not be reordered with any speculative LOADS or STORES from
	 * work->current_func, which is executed afterwards.  This possible
	 * reordering can lead to a missed execution on attempt to queue
	 * the same @work.  E.g. consider this case:
	 *
	 *   CPU#0                         CPU#1
	 *   ----------------------------  --------------------------------
	 *
	 * 1  STORE event_indicated
	 * 2  queue_work_on() {
	 * 3    test_and_set_bit(PENDING)
	 * 4 }                             set_..._and_clear_pending() {
	 * 5                                 set_work_data() # clear bit
	 * 6                                 smp_mb()
	 * 7                               work->current_func() {
	 * 8				      LOAD event_indicated
	 *				   }
	 *
	 * Without an explicit full barrier speculative LOAD on line 8 can
	 * be executed before CPU#0 does STORE on line 1.  If that happens,
	 * CPU#0 observes the PENDING bit is still set and new execution of
	 * a @work is not queued in the hope that CPU#1 will eventually
	 * finish the queued @work.  Meanwhile CPU#1 does not see
	 * event_indicated is set, because speculative LOAD was executed
	 * before actual STORE.
	 */
	smp_mb();
}

static void clear_work_data(struct work_struct *work)
{
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and allow read
 * access under sched-RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or with preemption disabled.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	int pool_id;

	assert_rcu_or_pool_mutex();

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;

	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
		return NULL;

	return idr_find(&worker_pool_idr, pool_id);
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;

	return data >> WORK_OFFQ_POOL_SHIFT;
}

static void mark_work_canceling(struct work_struct *work)
{
	unsigned long pool_id = get_work_pool_id(work);

	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !atomic_read(&pool->nr_running);
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
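
/*
 * As a worked example of the check above: with MAX_IDLE_WORKERS_RATIO == 4
 * and 16 busy workers, (nr_idle - 2) * 4 >= 16 first holds at nr_idle == 6,
 * so up to 5 idle workers are tolerated before too_many_workers() reports
 * an excess.
 */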

/*
 * Wake up functions.
 */

/* Return the first idle worker.  Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_idle_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}

/**
 * wq_worker_waking_up - a worker is waking up
 * @task: task waking up
 * @cpu: CPU @task is waking up to
 *
 * This function is called during try_to_wake_up() when a worker is
 * being awoken.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 */
void wq_worker_waking_up(struct task_struct *task, int cpu)
{
	struct worker *worker = kthread_data(task);

	if (!(worker->flags & WORKER_NOT_RUNNING)) {
		WARN_ON_ONCE(worker->pool->cpu != cpu);
		atomic_inc(&worker->pool->nr_running);
	}
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called during schedule() when a busy worker is
 * going to sleep.  A worker on the same cpu can be woken up by
 * returning a pointer to its task.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 *
 * Return:
 * Worker task on @cpu to wake up, %NULL if none.
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
	struct worker_pool *pool;

	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here; let's not access anything before
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
		return NULL;

	pool = worker->pool;

	/* this can only happen on the local cpu */
	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
		return NULL;

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 *
	 * NOT_RUNNING is clear.  This means that we're bound to and
	 * running on the local cpu w/ rq lock held and preemption
	 * disabled, which in turn means that no one else could be
	 * manipulating idle_list, so dereferencing idle_list without pool
	 * lock is safe.
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist))
		to_wakeup = first_idle_worker(pool);