/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
#include <linux/ipipe.h>

#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old cpu in task_rq_lock, the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock, the acquire will
		 * pair with the WMB to ensure we must then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
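
/*
 * Illustrative sketch (not part of the original file): a minimal example of
 * how task_rq_lock()/task_rq_unlock() are typically paired by callers. The
 * helper name example_read_policy() is hypothetical; the locking pattern
 * itself mirrors the callers elsewhere in this file.
 */
#if 0	/* example only, never compiled */
static int example_read_policy(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	int policy;

	rq = task_rq_lock(p, &rf);	/* takes p->pi_lock and rq->lock */
	policy = p->policy;		/* p cannot migrate while both are held */
	task_rq_unlock(rq, p, &rf);	/* releases rq->lock, then p->pi_lock */

	return policy;
}
#endif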

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}
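
/*
 * Illustrative sketch (not part of the original file): the usual calling
 * pattern for the clock helpers above. update_rq_clock() must run under
 * rq->lock, and rq_clock()/rq_clock_task() then return the freshly updated
 * timestamps. The function name example_read_rq_clock() is hypothetical.
 */
#if 0	/* example only, never compiled */
static u64 example_read_rq_clock(struct rq *rq)
{
	struct rq_flags rf;
	u64 now;

	rq_lock(rq, &rf);		/* rq->lock held, pinned via rf */
	update_rq_clock(rq);		/* advance rq->clock and rq->clock_task */
	now = rq_clock_task(rq);	/* task clock excludes irq/steal time */
	rq_unlock(rq, &rf);

	return now;
}
#endif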


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
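
/*
 * Illustrative sketch (not part of the original file): how wake_q_add() and
 * wake_up_q() are meant to be used together. Tasks are queued while a lock is
 * held and the actual wakeups are issued only after it is dropped, keeping
 * wake_up_process() out of the critical section. DEFINE_WAKE_Q() comes from
 * <linux/sched/wake_q.h>; example_lock and example_wake_one() are
 * hypothetical.
 */
#if 0	/* example only, never compiled */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_wake_one(struct task_struct *p)
{
	DEFINE_WAKE_Q(wake_q);

	raw_spin_lock(&example_lock);
	wake_q_add(&wake_q, p);		/* grabs a reference, defers the wakeup */
	raw_spin_unlock(&example_lock);

	wake_up_q(&wake_q);		/* actual wake_up_process() outside the lock */
}
#endif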

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!is_housekeeping_cpu(cpu))
		cpu = housekeeping_any_cpu();
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
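
/*
 * Illustrative sketch (not part of the original file): a typical use of the
 * iterator above. A visitor is invoked for every group on the way down and
 * tg_nop() is passed for the up direction, matching how other scheduler code
 * drives walk_tg_tree_from(). example_count_one() and example_count_groups()
 * are hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;		/* visit: just count the group */
	return 0;			/* non-zero would abort the walk */
}

static int example_count_groups(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();		/* the tree is walked under RCU */
	walk_tg_tree_from(from, example_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}
#endif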

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(sched_prio_to_weight[prio]);
	load->inv_weight = sched_prio_to_wmult[prio];
}
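
/*
 * Worked example (editorial note, not in the original file): a SCHED_NORMAL
 * task at nice 0 has static_prio = 120, so prio = 120 - MAX_RT_PRIO = 20 and
 * load->weight = scale_load(sched_prio_to_weight[20]) = scale_load(1024).
 * Each nice level away from 0 scales the weight by roughly 1.25x (nice -1
 * maps to 1277, nice +1 to 820), which is what gives nicer tasks a smaller
 * share of CPU time.
 */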

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);

	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);

	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}
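
/*
 * Worked example (editorial note, not in the original file): for a nice-0
 * SCHED_NORMAL task, normal_prio() returns static_prio = 120 and
 * effective_prio() returns 120 as well. For a SCHED_FIFO task with
 * rt_priority = 50, normal_prio() returns MAX_RT_PRIO-1 - 50 = 49, and a
 * SCHED_DEADLINE task gets MAX_DL_PRIO-1 = -1 (lower numbers mean higher
 * priority). If a fair task is PI-boosted into the RT range, rt_prio(p->prio)
 * is true and effective_prio() deliberately keeps the boosted p->prio.
 */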

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP

static inline bool is_per_cpu_kthread(struct task_struct *p)
{
	if (!(p->flags & PF_KTHREAD))
		return false;

	if (p->nr_cpus_allowed != 1)
		return false;

	return true;
}

/*
 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
		return false;

	if (is_per_cpu_kthread(p))
		return cpu_online(cpu);

	return cpu_active(cpu);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
				   struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, new_cpu);
	rq_unlock(rq, rf);

	rq = cpu_rq(new_cpu);

	rq_lock(rq, rf);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	update_rq_clock(rq);
	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	rq_unlock(rq, &rf);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}
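
/*
 * Illustrative sketch (not part of the original file): how the stopper-based
 * migration above is typically driven. It loosely mirrors the pattern used by
 * __set_cpus_allowed_ptr(): fill in a struct migration_arg, drop the task's
 * locks, and let the stopper thread on the task's current CPU run
 * migration_cpu_stop(). example_migrate_task() is hypothetical.
 */
#if 0	/* example only, never compiled */
static void example_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_arg arg = { p, dest_cpu };
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		/* Can't just move a running task: ask the stopper to preempt it. */
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
	} else if (task_on_rq_queued(p)) {
		/* Queued but not running: move it directly under rq->lock. */
		rq = move_queued_task(rq, &rf, p, dest_cpu);
		task_rq_unlock(rq, p, &rf);
	} else {
		task_rq_unlock(rq, p, &rf);
	}
}
#endif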

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;