/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
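
/*
 * Usage sketch (for illustration only): a control file's ->private value
 * packs a resource type into the high bits and an attribute into the low
 * 16 bits, and the read/write handlers unpack them again:
 *
 *	unsigned long priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);
 *
 *	MEMFILE_TYPE(priv) == _OOM_TYPE;
 *	MEMFILE_ATTR(priv) == OOM_CONTROL;
 */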

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and such to see this symbol as well
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

#endif /* !CONFIG_SLOB */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}
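
/*
 * Example consumer (illustration only): a procfs-style dump that tolerates
 * the raciness documented above:
 *
 *	ino_t ino = page_cgroup_ino(page);
 *	if (ino)
 *		seq_printf(m, "cgroup inode %lu\n", (unsigned long)ino);
 */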

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}
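
/*
 * Worked example: with page_counter_read(&memcg->memory) at 1536 pages and
 * a soft limit of 1024 pages, soft_limit_excess() returns 512; at or below
 * the soft limit it returns 0, which keeps the group off the soft limit tree.
 */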

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}
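
/*
 * Sketch of how the soft limit tree is consumed (illustration only; the
 * real loop lives in mem_cgroup_soft_limit_reclaim() further down in this
 * file):
 *
 *	mctz = soft_limit_tree_node(pgdat->node_id);
 *	do {
 *		mz = mem_cgroup_largest_soft_limit_node(mctz);
 *		if (!mz)
 *			break;
 *		... reclaim from mz->memcg, then put it back on the tree
 *		    with its recomputed excess so other groups get a turn
 *	} while (...);
 */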

static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
				      int event)
{
	return atomic_long_read(&memcg->events[event]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
	else {
		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
		if (PageSwapBacked(page))
			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		nr += mem_cgroup_get_lru_size(lruvec, lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}
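
/*
 * Example (illustrative): counting only the file LRUs of a memcg across all
 * nodes, using the lru_mask convention of one bit per enum lru_list entry:
 *
 *	unsigned long nr_file = mem_cgroup_nr_lru_pages(memcg,
 *			BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE));
 */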

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat_cpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}
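
/*
 * In effect, the per-cpu nr_page_events counter drives three independent
 * event targets: thresholds are rechecked roughly every
 * THRESHOLDS_EVENTS_TARGET (128) page events, the soft limit tree every
 * SOFTLIMIT_EVENTS_TARGET (1024), and the NUMA info every
 * NUMAINFO_EVENTS_TARGET (1024).  A charge or uncharge path therefore
 * pairs up roughly as follows (illustration only):
 *
 *	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
 *	memcg_check_events(memcg, page);
 */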

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
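
/*
 * The returned memcg carries a css reference which the caller must drop,
 * e.g. (illustration only):
 *
 *	memcg = get_mem_cgroup_from_mm(current->mm);
 *	... use memcg ...
 *	css_put(&memcg->css);
 */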

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
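
/*
 * Typical full-walk usage, which is what the for_each_mem_cgroup_tree()
 * helper below expands to (should_stop() stands in for whatever condition
 * ends the walk early; illustration only):
 *
 *	struct mem_cgroup *iter;
 *
 *	for (iter = mem_cgroup_iter(root, NULL, NULL);
 *	     iter;
 *	     iter = mem_cgroup_iter(root, iter, NULL)) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */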

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	while ((memcg = parent_mem_cgroup(memcg))) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
		}
	}
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &pgdat->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;