Commit 0f8c7901 authored by Linus Torvalds

Merge branch 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue update from Tejun Heo:
 "Workqueue changes for v4.5.  One cleanup patch and three to improve
  the debuggability.

  Workqueue now has a stall detector which dumps workqueue state if any
  worker pool hasn't made forward progress over a certain amount of time
  (30s by default) and also triggers a warning if a workqueue which can
  be used in memory reclaim path tries to wait on something which can't
  be.

  These should make workqueue hangs a lot easier to debug."

* 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: simplify the apply_workqueue_attrs_locked()
  workqueue: implement lockup detector
  watchdog: introduce touch_softlockup_watchdog_sched()
  workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue
parents 3d116a66 6201171e
...@@ -4140,6 +4140,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ...@@ -4140,6 +4140,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or other driver-specific files in the or other driver-specific files in the
Documentation/watchdog/ directory. Documentation/watchdog/ directory.
workqueue.watchdog_thresh=
If CONFIG_WQ_WATCHDOG is configured, workqueue can
warn about stall conditions and dump internal state to
help debugging. 0 disables workqueue stall
detection; otherwise, it's the stall threshold
duration in seconds. The default value is 30 and
it can be updated at runtime by writing to the
corresponding sysfs file.
workqueue.disable_numa workqueue.disable_numa
By default, all work items queued to unbound By default, all work items queued to unbound
workqueues are affine to the NUMA nodes they're workqueues are affine to the NUMA nodes they're
......
...@@ -377,6 +377,7 @@ extern void scheduler_tick(void); ...@@ -377,6 +377,7 @@ extern void scheduler_tick(void);
extern void sched_show_task(struct task_struct *p); extern void sched_show_task(struct task_struct *p);
#ifdef CONFIG_LOCKUP_DETECTOR #ifdef CONFIG_LOCKUP_DETECTOR
extern void touch_softlockup_watchdog_sched(void);
extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void); extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void); extern void touch_all_softlockup_watchdogs(void);
...@@ -387,6 +388,9 @@ extern unsigned int softlockup_panic; ...@@ -387,6 +388,9 @@ extern unsigned int softlockup_panic;
extern unsigned int hardlockup_panic; extern unsigned int hardlockup_panic;
void lockup_detector_init(void); void lockup_detector_init(void);
#else #else
/*
 * Empty stand-in for touch_softlockup_watchdog_sched(), used in the #else
 * branch of CONFIG_LOCKUP_DETECTOR so callers need no #ifdef of their own.
 */
static inline void touch_softlockup_watchdog_sched(void)
{
}
static inline void touch_softlockup_watchdog(void) static inline void touch_softlockup_watchdog(void)
{ {
} }
......
...@@ -618,4 +618,10 @@ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) ...@@ -618,4 +618,10 @@ static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{ return 0; } { return 0; }
#endif /* CONFIG_SYSFS */ #endif /* CONFIG_SYSFS */
/*
 * wq_watchdog_touch() is a real function only when CONFIG_WQ_WATCHDOG is
 * set; otherwise it collapses to an empty inline so callers can invoke it
 * unconditionally.
 */
#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else /* CONFIG_WQ_WATCHDOG */
static inline void wq_watchdog_touch(int cpu) { }
#endif /* CONFIG_WQ_WATCHDOG */
#endif #endif
...@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) ...@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return; return;
sched_clock_tick(); sched_clock_tick();
touch_softlockup_watchdog(); touch_softlockup_watchdog_sched();
} }
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
......
...@@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) ...@@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks. * when we go busy again does not account too much ticks.
*/ */
if (ts->tick_stopped) { if (ts->tick_stopped) {
touch_softlockup_watchdog(); touch_softlockup_watchdog_sched();
if (is_idle_task(current)) if (is_idle_task(current))
ts->idle_jiffies++; ts->idle_jiffies++;
} }
...@@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now) ...@@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now); tick_do_update_jiffies64(now);
local_irq_restore(flags); local_irq_restore(flags);
touch_softlockup_watchdog(); touch_softlockup_watchdog_sched();
} }
/* /*
...@@ -717,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int ...@@ -717,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int
update_cpu_load_nohz(active); update_cpu_load_nohz(active);
calc_load_exit_idle(); calc_load_exit_idle();
touch_softlockup_watchdog(); touch_softlockup_watchdog_sched();
/* /*
* Cancel the scheduled timer and restore the tick * Cancel the scheduled timer and restore the tick
*/ */
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/smpboot.h> #include <linux/smpboot.h>
#include <linux/sched/rt.h> #include <linux/sched/rt.h>
#include <linux/tick.h> #include <linux/tick.h>
#include <linux/workqueue.h>
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
#include <linux/kvm_para.h> #include <linux/kvm_para.h>
...@@ -225,7 +226,15 @@ static void __touch_watchdog(void) ...@@ -225,7 +226,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp()); __this_cpu_write(watchdog_touch_ts, get_timestamp());
} }
void touch_softlockup_watchdog(void) /**
* touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
*
* Call when the scheduler may have stalled for legitimate reasons
* preventing the watchdog task from executing - e.g. the scheduler
* entering idle state. This should only be used for scheduler events.
* Use touch_softlockup_watchdog() for everything else.
*/
void touch_softlockup_watchdog_sched(void)
{ {
/* /*
* Preemption can be enabled. It doesn't matter which CPU's timestamp * Preemption can be enabled. It doesn't matter which CPU's timestamp
...@@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) ...@@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void)
*/ */
raw_cpu_write(watchdog_touch_ts, 0); raw_cpu_write(watchdog_touch_ts, 0);
} }
/**
 * touch_softlockup_watchdog - touch the softlockup and workqueue watchdogs
 *
 * Performs the same touch as touch_softlockup_watchdog_sched() and then
 * also touches the workqueue watchdog for the CPU reported by
 * raw_smp_processor_id().
 */
void touch_softlockup_watchdog(void)
{
	int this_cpu;

	touch_softlockup_watchdog_sched();

	/* sampled after the softlockup touch, same order as a direct call */
	this_cpu = raw_smp_processor_id();
	wq_watchdog_touch(this_cpu);
}
EXPORT_SYMBOL(touch_softlockup_watchdog); EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void) void touch_all_softlockup_watchdogs(void)
...@@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) ...@@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void)
*/ */
for_each_watchdog_cpu(cpu) for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(watchdog_touch_ts, cpu) = 0;
wq_watchdog_touch(-1);
} }
#ifdef CONFIG_HARDLOCKUP_DETECTOR #ifdef CONFIG_HARDLOCKUP_DETECTOR
......
...@@ -148,6 +148,8 @@ struct worker_pool { ...@@ -148,6 +148,8 @@ struct worker_pool {
int id; /* I: pool ID */ int id; /* I: pool ID */
unsigned int flags; /* X: flags */ unsigned int flags; /* X: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
struct list_head worklist; /* L: list of pending works */ struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */ int nr_workers; /* L: total number of workers */
...@@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) ...@@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work); struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work); trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL); move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++; pwq->nr_active++;
...@@ -1385,6 +1389,8 @@ retry: ...@@ -1385,6 +1389,8 @@ retry:
trace_workqueue_activate_work(work); trace_workqueue_activate_work(work);
pwq->nr_active++; pwq->nr_active++;
worklist = &pwq->pool->worklist; worklist = &pwq->pool->worklist;
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else { } else {
work_flags |= WORK_STRUCT_DELAYED; work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works; worklist = &pwq->delayed_works;
...@@ -2157,6 +2163,8 @@ recheck: ...@@ -2157,6 +2163,8 @@ recheck:
list_first_entry(&pool->worklist, list_first_entry(&pool->worklist,
struct work_struct, entry); struct work_struct, entry);
pool->watchdog_ts = jiffies;
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */ /* optimization path, not strictly necessary */
process_one_work(worker, work); process_one_work(worker, work);
...@@ -2240,6 +2248,7 @@ repeat: ...@@ -2240,6 +2248,7 @@ repeat:
struct pool_workqueue, mayday_node); struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool; struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n; struct work_struct *work, *n;
bool first = true;
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node); list_del_init(&pwq->mayday_node);
...@@ -2256,9 +2265,14 @@ repeat: ...@@ -2256,9 +2265,14 @@ repeat:
* process'em. * process'em.
*/ */
WARN_ON_ONCE(!list_empty(scheduled)); WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry) list_for_each_entry_safe(work, n, &pool->worklist, entry) {
if (get_work_pwq(work) == pwq) if (get_work_pwq(work) == pwq) {
if (first)
pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n); move_linked_works(work, scheduled, &n);
}
first = false;
}
if (!list_empty(scheduled)) { if (!list_empty(scheduled)) {
process_scheduled_works(rescuer); process_scheduled_works(rescuer);
...@@ -2316,6 +2330,37 @@ repeat: ...@@ -2316,6 +2330,37 @@ repeat:
goto repeat; goto repeat;
} }
/**
 * check_flush_dependency - sanity-check who is flushing what
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 *
 * %current is about to wait for @target_wq as a whole, or for @target_work
 * on it.  When @target_wq lacks %WQ_MEM_RECLAIM, emit a one-time warning
 * if %current is a %PF_MEMALLOC task, and another if %current is a worker
 * executing for a workqueue that has %WQ_MEM_RECLAIM set.  Such a flush can
 * break the forward-progress guarantee and lead to a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
				   struct work_struct *target_work)
{
	struct worker *self;
	work_func_t fn = NULL;

	/* only used for the warning text; NULL for whole-workqueue flushes */
	if (target_work)
		fn = target_work->func;

	/* a reclaim-safe target is never a problem */
	if (target_wq->flags & WQ_MEM_RECLAIM)
		return;

	self = current_wq_worker();

	WARN_ONCE(current->flags & PF_MEMALLOC,
		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
		  current->pid, current->comm, target_wq->name, fn);

	/* self is only dereferenced when the condition has already checked it */
	WARN_ONCE(self && (self->current_pwq->wq->flags & WQ_MEM_RECLAIM),
		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
		  self->current_pwq->wq->name, self->current_func,
		  target_wq->name, fn);
}
struct wq_barrier { struct wq_barrier {
struct work_struct work; struct work_struct work;
struct completion done; struct completion done;
...@@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq) ...@@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow); list_add_tail(&this_flusher.list, &wq->flusher_overflow);
} }
check_flush_dependency(wq, NULL);
mutex_unlock(&wq->mutex); mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done); wait_for_completion(&this_flusher.done);
...@@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) ...@@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
pwq = worker->current_pwq; pwq = worker->current_pwq;
} }
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker); insert_wq_barrier(pwq, barr, work, worker);
spin_unlock_irq(&pool->lock); spin_unlock_irq(&pool->lock);
...@@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool) ...@@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1; pool->cpu = -1;
pool->node = NUMA_NO_NODE; pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED; pool->flags |= POOL_DISASSOCIATED;
pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list); INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash); hash_init(pool->busy_hash);
...@@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, ...@@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs) const struct workqueue_attrs *attrs)
{ {
struct apply_wqattrs_ctx *ctx; struct apply_wqattrs_ctx *ctx;
int ret = -ENOMEM;
/* only unbound workqueues can change attributes */ /* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND))) if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
...@@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, ...@@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL; return -EINVAL;
ctx = apply_wqattrs_prepare(wq, attrs); ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
return -ENOMEM;
/* the ctx has been prepared successfully, let's commit it */ /* the ctx has been prepared successfully, let's commit it */
if (ctx) { apply_wqattrs_commit(ctx);
apply_wqattrs_commit(ctx);
ret = 0;
}
apply_wqattrs_cleanup(ctx); apply_wqattrs_cleanup(ctx);
return ret; return 0;
} }
/** /**
...@@ -4308,7 +4355,9 @@ void show_workqueue_state(void) ...@@ -4308,7 +4355,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id); pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool); pr_cont_pool_info(pool);
pr_cont(" workers=%d", pool->nr_workers); pr_cont(" hung=%us workers=%d",
jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
pool->nr_workers);
if (pool->manager) if (pool->manager)
pr_cont(" manager: %d", pr_cont(" manager: %d",
task_pid_nr(pool->manager->task)); task_pid_nr(pool->manager->task));
...@@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) ...@@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */ #endif /* CONFIG_SYSFS */
/*
* Workqueue watchdog.
*
* Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
* flush dependency, a concurrency managed work item which stays RUNNING
* indefinitely. Workqueue stalls can be very difficult to debug as the
* usual warning mechanisms don't trigger and internal workqueue state is
* largely opaque.
*
* Workqueue watchdog monitors all worker pools periodically and dumps
* state if some pools failed to make forward progress for a while where
* forward progress is defined as the first item on ->worklist changing.
*
* This mechanism is controlled through the kernel parameter
* "workqueue.watchdog_thresh" which can be updated at runtime through the
* corresponding sysfs parameter file.
*/
#ifdef CONFIG_WQ_WATCHDOG
/* forward declaration: the timer initializer below names the callback */
static void wq_watchdog_timer_fn(unsigned long data);
/* stall threshold in seconds (multiplied by HZ where used); 0 disables detection */
static unsigned long wq_watchdog_thresh = 30;
/* timer whose callback is wq_watchdog_timer_fn(); armed via mod_timer() */
static struct timer_list wq_watchdog_timer =
TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
/* jiffies of the last touch made with cpu < 0; compared against every pool */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
/* jiffies of the last touch aimed at one CPU; consulted for pools with pool->cpu >= 0 */
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
static void wq_watchdog_reset_touched(void)
{
int cpu;
wq_watchdog_touched = jiffies;
for_each_possible_cpu(cpu)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}
/*
 * Timer callback of the workqueue watchdog.
 *
 * Walks every worker pool and reports those with pending work whose newest
 * relevant timestamp (pool progress, global touch, or the pool CPU's touch)
 * is more than the threshold in the past.  If any pool was reported, the
 * full workqueue state is dumped afterwards.  Finally the touch timestamps
 * are reset and the timer is re-armed one threshold from now.
 *
 * @data: not used by this function.
 */
static void wq_watchdog_timer_fn(unsigned long data)
{
/* the threshold is stored in seconds; convert to jiffies once per run */
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
struct worker_pool *pool;
int pi;
/* threshold of 0 means disabled: leave without re-arming the timer */
if (!thresh)
return;
rcu_read_lock();
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
/* nothing pending on this pool, so there is nothing to be stalled */
if (list_empty(&pool->worklist))
continue;
/* get the latest of pool and touched timestamps */
pool_ts = READ_ONCE(pool->watchdog_ts);
touched = READ_ONCE(wq_watchdog_touched);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
/* pools with pool->cpu >= 0 also honour that CPU's own touch timestamp */
if (pool->cpu >= 0) {
unsigned long cpu_touched =
READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
pool->cpu));
if (time_after(cpu_touched, ts))
ts = cpu_touched;
}
/* did we stall? */
if (time_after(jiffies, ts + thresh)) {
lockup_detected = true;
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
/* duration is measured from pool_ts, not from the touch-adjusted ts */
pr_cont(" stuck for %us!\n",
jiffies_to_msecs(jiffies - pool_ts) / 1000);
}
}
rcu_read_unlock();
/* dump the workqueue state once, after the scan, if any pool tripped */
if (lockup_detected)
show_workqueue_state();
/* start a fresh observation window and schedule the next check */
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}
/*
 * Record a watchdog touch at the current jiffies value.
 *
 * @cpu: CPU whose per-CPU touch timestamp is updated; a negative value
 *       updates the global touch timestamp instead.
 */
void wq_watchdog_touch(int cpu)
{
	if (cpu < 0) {
		wq_watchdog_touched = jiffies;
		return;
	}

	per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}
/*
 * Install a new stall threshold and restart the watchdog timer.
 *
 * @thresh: threshold in seconds; 0 leaves the watchdog stopped.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
	/*
	 * Zero the threshold before removing the timer: the timer function
	 * returns without re-arming when it reads 0.
	 */
	wq_watchdog_thresh = 0;
	del_timer_sync(&wq_watchdog_timer);

	if (!thresh)
		return;

	wq_watchdog_thresh = thresh;
	wq_watchdog_reset_touched();
	mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}
/*
 * "set" handler for the watchdog_thresh parameter.
 *
 * @val: textual value, parsed with kstrtoul() using base auto-detection.
 * @kp:  kernel parameter descriptor (not used here).
 *
 * Returns 0 on success or the error reported by kstrtoul().
 */
static int wq_watchdog_param_set_thresh(const char *val,
					const struct kernel_param *kp)
{
	unsigned long parsed;
	int err = kstrtoul(val, 0, &parsed);

	if (err)
		return err;

	if (!system_wq) {
		/*
		 * NOTE(review): system_wq unset presumably means workqueues
		 * are not initialised yet; only record the value here and
		 * leave arming the timer to wq_watchdog_init().
		 */
		wq_watchdog_thresh = parsed;
		return 0;
	}

	wq_watchdog_set_thresh(parsed);
	return 0;
}
/*
 * Parameter operations for watchdog_thresh: writes go through
 * wq_watchdog_param_set_thresh(), reads use the stock unsigned long getter.
 */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
.set = wq_watchdog_param_set_thresh,
.get = param_get_ulong,
};
/* expose wq_watchdog_thresh as parameter "watchdog_thresh" with mode 0644 */
module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
0644);
/*
 * Apply whatever threshold is currently stored in wq_watchdog_thresh, which
 * arms the watchdog timer when that value is non-zero.
 */
static void wq_watchdog_init(void)
{
wq_watchdog_set_thresh(wq_watchdog_thresh);
}
#else /* CONFIG_WQ_WATCHDOG */
/* watchdog compiled out: initialisation is a no-op */
static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
static void __init wq_numa_init(void) static void __init wq_numa_init(void)
{ {
cpumask_var_t *tbl; cpumask_var_t *tbl;
...@@ -5290,6 +5487,9 @@ static int __init init_workqueues(void) ...@@ -5290,6 +5487,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq || !system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq || !system_power_efficient_wq ||
!system_freezable_power_efficient_wq); !system_freezable_power_efficient_wq);
wq_watchdog_init();
return 0; return 0;
} }
early_initcall(init_workqueues); early_initcall(init_workqueues);
...@@ -812,6 +812,17 @@ config BOOTPARAM_HUNG_TASK_PANIC_VALUE ...@@ -812,6 +812,17 @@ config BOOTPARAM_HUNG_TASK_PANIC_VALUE
default 0 if !BOOTPARAM_HUNG_TASK_PANIC default 0 if !BOOTPARAM_HUNG_TASK_PANIC
default 1 if BOOTPARAM_HUNG_TASK_PANIC default 1 if BOOTPARAM_HUNG_TASK_PANIC
config WQ_WATCHDOG
bool "Detect Workqueue Stalls"
depends on DEBUG_KERNEL
help
Say Y here to enable stall detection on workqueues. If a
worker pool doesn't make forward progress on a pending work
item for over a given amount of time, 30s by default, a
warning message is printed along with a dump of the workqueue
state. This can be configured through kernel parameter
"workqueue.watchdog_thresh" and its sysfs counterpart.
endmenu # "Debug lockups and hangs" endmenu # "Debug lockups and hangs"
config PANIC_ON_OOPS config PANIC_ON_OOPS
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment