Commit e78c3496 authored by Rik van Riel's avatar Rik van Riel Committed by Ingo Molnar

time, signal: Protect resource use statistics with seqlock

Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a

The lock protects against reporting a wrong value, due to a thread in the
task group exiting, its statistics reporting up to the signal struct, and
that exited task's statistics being counted twice (or not at all).

Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.

This can be fixed by using a seqlock around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime(), slightly simplifying the calling

In the case of posix_cpu_clock_get_task() things can be simplified a
lot, because the calling function already ensures that the task sticks
around, and the rest is now taken care of in thread_group_cputime().

This way the statistics reporting code can run lockless.
Signed-off-by: default avatarRik van Riel <>
Signed-off-by: default avatarPeter Zijlstra (Intel) <>
Cc: Alex Thorlton <>
Cc: Andrew Morton <>
Cc: Daeseok Youn <>
Cc: David Rientjes <>
Cc: Dongsheng Yang <>
Cc: Geert Uytterhoeven <>
Cc: Guillaume Morin <>
Cc: Ionut Alexa <>
Cc: Kees Cook <>
Cc: Linus Torvalds <>
Cc: Li Zefan <>
Cc: Michal Hocko <>
Cc: Michal Schmidt <>
Cc: Oleg Nesterov <>
Cc: Vladimir Davydov <>
Link: default avatarIngo Molnar <>
parent 90ed9cbe
......@@ -645,6 +645,7 @@ struct signal_struct {
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
seqlock_t stats_lock;
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
......@@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk)
* the signal_struct.
task_cputime(tsk, &utime, &stime);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
......@@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
__unhash_process(tsk, group_dead);
* Do this under ->siglock, we can race with another thread
......@@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig = p->real_parent->signal;
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
......@@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
......@@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->curr_target = tsk;
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
......@@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
struct signal_struct *sig = tsk->signal;
cputime_t utime, stime;
struct task_struct *t;
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
unsigned int seq, nextseq;
for_each_thread(tsk, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
/* Attempt a lockless read on the first round. */
nextseq = 0;
do {
seq = nextseq;
read_seqbegin_or_lock(&sig->stats_lock, &seq);
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
for_each_thread(tsk, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
/* If lockless access failed, take the lock. */
nextseq = 1;
} while (need_seqretry(&sig->stats_lock, seq));
done_seqretry(&sig->stats_lock, seq);
......@@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
* Must be called with siglock held.
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
struct task_cputime cputime;
......@@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms)
cputime_t tgutime, tgstime, cutime, cstime;
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
tms->tms_utime = cputime_to_clock_t(tgutime);
tms->tms_stime = cputime_to_clock_t(tgstime);
tms->tms_cutime = cputime_to_clock_t(cutime);
......@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
if (same_thread_group(tsk, current))
err = cpu_clock_sample(which_clock, tsk, &rtn);
} else {
unsigned long flags;
struct sighand_struct *sighand;
* while_each_thread() is not yet entirely RCU safe,
* keep locking the group while sampling process
* clock for now.
sighand = lock_task_sighand(tsk, &flags);
if (!sighand)
return err;
if (tsk == current || thread_group_leader(tsk))
err = cpu_clock_sample_group(which_clock, tsk, &rtn);
unlock_task_sighand(tsk, &flags);
if (!err)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment