/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * These are the old Memory Nodes the tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};
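
/*
 * Illustrative example (not part of the original source): on the default
 * hierarchy, if a parent's effective_cpus is 0-7 and a child is configured
 * with cpuset.cpus = 4-11, the child's effective_cpus becomes 4-7, i.e. the
 * intersection; if that intersection were empty, the child would simply
 * inherit the parent's effective mask, as described above.
 */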

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif


/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
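
/*
 * Illustrative usage sketch, not part of the original file; the variable
 * names are hypothetical.  validate_change() below walks children in
 * essentially this way, always under rcu_read_lock():
 *
 *	struct cgroup_subsys_state *pos;
 *	struct cpuset *child;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent)
 *		pr_info("child cpus: %*pbl\n",
 *			cpumask_pr_args(child->cpus_allowed));
 *	rcu_read_unlock();
 */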

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
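
/*
 * Illustrative sketch, not part of the original file: a pre-order walk can
 * prune a whole subtree by advancing the cursor, which is how
 * update_domain_attr_tree() and update_cpumasks_hier() below use it:
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos, root) {
 *		if (cpumask_empty(cp->cpus_allowed)) {
 *			pos = css_rightmost_descendant(pos);
 *			continue;
 *		}
 *		(process cp)
 *	}
 *	rcu_read_unlock();
 */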

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task; we use alloc_lock in the task_struct to protect them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);
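
/*
 * Illustrative sketch, not part of the original file, of the nesting the
 * comment above describes: writers take cpuset_mutex first and hold
 * callback_lock (IRQ-safe) only around the actual updates:
 *
 *	mutex_lock(&cpuset_mutex);
 *	(validate, allocate memory, etc. - allowed while holding the mutex)
 *	spin_lock_irq(&callback_lock);
 *	(publish the new masks)
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);
 *
 * Readers that only need a consistent snapshot take just callback_lock.
 */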

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
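
/*
 * Worked example (illustrative, not from the original source): if a sibling
 * cpuset is cpu_exclusive with cpus_allowed = 0-3, a trial change giving
 * this cpuset cpus_allowed = 2-5 overlaps that sibling and is rejected with
 * -EINVAL; emptying the cpus or mems of a populated cpuset is likewise
 * rejected with -ENOSPC.
 */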

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets.  This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'.  For our purposes, rebuilding the scheduler's
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed, but which don't have the same 'pn' partition
 *	number, and puts them in the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then forms the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
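
/*
 * Worked example (illustrative, not from the original source): with three
 * load-balanced cpusets whose CPUs are 0-3, 2-5 and 6-7, the first two
 * overlap and end up with the same 'pn', so the resulting partition passed
 * to partition_sched_domains() has ndoms == 2 with the domains {0-5} and
 * {6-7}.
 */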
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
		goto done;
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
				     non_isolated_cpus);

		goto done;
	}

	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, non_isolated_cpus);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	free_cpumask_var(non_isolated_cpus);
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * We have raced with CPU hotplug. Don't do anything to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, hotplug work item will rebuild sched domains.
	 */
	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	spin_unlock_irq(&callback_lock);

	/* use trialcs->cpus_allowed as a temp variable */
	update_cpumasks_hier(cs, trialcs->cpus_allowed);
	return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set to
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	task_lock(tsk);
	/*
	 * Determine if a loop is necessary if another thread is doing
	 * read_mems_allowed_begin().  If at least one node remains unchanged and
	 * tsk does not have a mempolicy, then an empty nodemask will not be
	 * possible when mems_allowed is larger than a word.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	if (need_loop) {
		local_irq_disable();
		write_seqcount_begin(&tsk->mems_allowed_seq);
	}

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop) {
		write_seqcount_end(&tsk->mems_allowed_seq);
		local_irq_enable();
	}

	task_unlock(tsk);
}
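
/*
 * Worked example (illustrative, not from the original source): moving a task
 * from mems 0-1 to the disjoint mems 2-3 is done in two steps so that
 * readers never observe an empty mask: after the nodes_or() above the
 * task's mems_allowed is temporarily 0-3, and only then is it narrowed to
 * the final 2-3.
 */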

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1124 1125
		else
			mmput(mm);
1126 1127
	}
	css_task_iter_end(&it);
1128

1129 1130 1131 1132 1133 1134
	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

1135
	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
1136
	cpuset_being_rebound = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
1137 1138
}

1139
/*
1140 1141 1142
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
1143
 *
1144 1145
 * When configured nodemask is changed, the effective nodemasks of this cpuset
 * and all its descendants need to be updated.
1146
 *
1147
 * On legacy hiearchy, effective_mems will be the same with mems_allowed.
1148 1149 1150
 *
 * Called with cpuset_mutex held
 */
1151
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1152 1153
{
	struct cpuset *cp;
1154
	struct cgroup_subsys_state *pos_css;
1155 1156

	rcu_read_lock();
1157 1158 1159 1160 1161
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

1162 1163 1164 1165
		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
1166 1167
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    nodes_empty(*new_mems))
1168 1169
			*new_mems = parent->effective_mems;

1170 1171 1172 1173
		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
1174
		}
1175

1176
		if (!css_tryget_online(&cp->css))
1177 1178 1179
			continue;
		rcu_read_unlock();

1180
		spin_lock_irq(&callback_lock);
1181
		cp->effective_mems = *new_mems;
1182
		spin_unlock_irq(&callback_lock);
1183

1184
		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1185
			!nodes_equal(cp->mems_allowed, cp->effective_mems));
1186

1187
		update_tasks_nodemask(cp);
1188 1189 1190 1191 1192 1193 1194

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

1195 1196 1197
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
1198 1199 1200 1201
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
1202
 *
1203
 * Call with cpuset_mutex held. May take callback_lock during call.
1204 1205 1206 1207
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
1208 1209
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
1210 1211 1212 1213
{
	int retval;

	/*
1214
	 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1215 1216
	 * it's read-only
	 */
1217 1218 1219 1220
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}
1221 1222 1223 1224 1225 1226 1227 1228

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
1229
		nodes_clear(trialcs->mems_allowed);
1230
	} else {
1231
		retval = nodelist_parse(buf, trialcs->mems_allowed);
1232 1233 1234
		if (retval < 0)
			goto done;

1235
		if (!nodes_subset(trialcs->mems_allowed,
1236 1237
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
1238 1239
			goto done;
		}
1240
	}
1241 1242

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1243 1244 1245
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
1246
	retval = validate_change(cs, trialcs);
1247 1248 1249
	if (retval < 0)
		goto done;

1250
	spin_lock_irq(&callback_lock);
1251
	cs->mems_allowed = trialcs->mems_allowed;
1252
	spin_unlock_irq(&callback_lock);
1253

1254
	/* use trialcs->mems_allowed as a temp variable */
1255
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
1256 1257 1258 1259
done:
	return retval;
}

1260 1261
int current_cpuset_is_being_rebound(void)
{
1262 1263 1264 1265 1266 1267 1268
	int ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
1269 1270
}