/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 *  it works better in sparse environments, where we have a lot of memcgs,
 *  but only a few of them are kmem-limited. Also, if we had, for instance,
 *  200 memcgs and none but the 200th was kmem-limited, we would need a
 *  200-entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't necessarily have to
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

#endif /* !CONFIG_SLOB */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

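/*
 * Insert @mz into the soft limit tree @mctz, ordered by how far the
 * memcg's usage exceeds its soft limit.  The rightmost node (largest
 * excess) is cached in mctz->rb_rightmost so the next reclaim victim
 * can be picked without walking the tree.
 */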
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

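/*
 * Number of pages by which @memcg's usage exceeds its soft limit,
 * or 0 when usage is at or below the limit.
 */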
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

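/* Remove @memcg's per-node entries from every node's soft limit tree. */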
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

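/*
 * Return the node info of the memcg with the largest soft limit excess
 * (the rightmost node in @mctz), removing it from the tree and taking
 * a css reference on the memcg.  Returns NULL when there is nothing
 * left to reclaim from.
 */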
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Return the event count for a single (non-recursive) @memcg.
 *
 * Implementation note on reading percpu statistics for memcg:
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * the cost of reading and the precision of the value, so we may eventually
 * want a periodic synchronization of the counters in memcg as well.
 *
 * But this _read() function is used for the user interface now. Users
 * account memory usage by memory cgroup and _always_ require an exact value,
 * because they are accounting memory. Even with a quick-and-fuzzy read, we
 * would still have to visit all online cpus and compute the sum. So, for
 * now, the extra synchronization is not implemented (it is only implemented
 * for cpu hotplug).
 *
 * If there are kernel-internal users that could make use of a not-exact
 * value, and reading all cpu values becomes a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 *
 * The parameter @event can be of type enum memcg_event_item or vm_event_item.
 */

static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
				      int event)
{
	unsigned long val = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->events[event], cpu);
	return val;
}

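/*
 * Update @memcg's per-cpu page-state counters and pagein/pageout events
 * for a charge of @nr_pages pages (negative @nr_pages for an uncharge).
 */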
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
	else {
		__this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
		if (PageSwapBacked(page))
			__this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		nr += mem_cgroup_get_lru_size(lruvec, lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

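/*
 * Returns true and advances the target when enough page events have
 * accumulated since this target last fired; used to rate-limit the
 * threshold, soft limit and NUMA-info updates below.
 */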
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

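/*
 * Return the memcg that mm->owner is charged to, with a css reference
 * held, falling back to root_mem_cgroup when there is no mm or no
 * associated memcg.  The caller is responsible for dropping the
 * reference with css_put().
 */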
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

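/*
 * Clear any cached reclaim iterator positions in @dead_memcg's
 * ancestors that still point at it, so shared hierarchy walks never
 * hand out a cgroup that is going away.
 */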
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	while ((memcg = parent_mem_cgroup(memcg))) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
		}
	}
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page