/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is a slower, static_key_enabled()-based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
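
/*
 * Usage sketch (added commentary, not part of the original file): callers
 * that only have a runtime subsystem ID, e.g. cgroup core iterating over
 * every subsystem, can gate per-subsystem work on this test:
 *
 *	for_each_subsys(ss, ssid)
 *		if (cgroup_ssid_enabled(ssid))
 *			pr_debug("%s enabled\n", cgroup_subsys_name[ssid]);
 */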

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * The set of behaviors which change on the default hierarchy are still
 * being determined and the mount option is prefixed with __DEVEL__.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}
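
/*
 * Usage sketch (added commentary): a controller callback can branch on the
 * interface version of its hierarchy, e.g. in a hypothetical css_online
 * implementation:
 *
 *	if (cgroup_on_dfl(css->cgroup))
 *		apply_v2_defaults(css);
 *	else
 *		apply_v1_defaults(css);
 *
 * where apply_v2_defaults() and apply_v1_defaults() are made-up helpers.
 */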

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}
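
/*
 * Note (added commentary): the wrappers above never sleep inside
 * cgroup_idr_lock - allocation memory is reserved with idr_preload()
 * before the lock is taken and __GFP_DIRECT_RECLAIM is masked off for
 * idr_alloc() itself, while the _bh lock variant keeps the lock safe to
 * take from bottom-half context as well.
 */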

static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

bool cgroup_is_threaded(struct cgroup *cgrp)
{
	return cgrp->dom_cgrp != cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	/*
	 * Root isn't under domain level resource control exempting it from
	 * the no-internal-process constraint, so it can serve as a thread
	 * root and a parent of resource domains at the same time.
	 */
	return !cgroup_parent(cgrp);
}

/* can @cgrp become a thread root? should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return true;

	/* domain roots can't be nested under threaded */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* can only have either domain or threaded children */
	if (cgrp->nr_populated_domain_children)
		return false;

	/* and no domain controllers can be enabled */
	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}

/* is @cgrp root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	/* thread root should be a domain */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* a domain w/ threaded children is a thread root */
	if (cgrp->nr_threaded_children)
		return true;

	/*
	 * A domain which has tasks and explicit threaded controllers
	 * enabled is a thread root.
	 */
	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}

/* a domain which isn't connected to the root w/o breakage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
	/* the cgroup itself can be a thread root */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* but the ancestors can't be unless mixable */
	while ((cgrp = cgroup_parent(cgrp))) {
		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
			return false;
		if (cgroup_is_threaded(cgrp))
			return false;
	}

	return true;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent) {
		u16 ss_mask = parent->subtree_control;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent) {
		u16 ss_mask = parent->subtree_ss_mask;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 * or is offline, %NULL is returned.
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (!css || !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}
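
/*
 * Usage sketch (added commentary): a css returned by cgroup_tryget_css()
 * is pinned and must be released with css_put() once the caller is done:
 *
 *	css = cgroup_tryget_css(cgrp, ss);
 *	if (css) {
 *		... use css ...
 *		css_put(css);
 *	}
 */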

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
							struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 *
 * The returned css is not guaranteed to be online, and therefore it is the
 * caller's responsibility to tryget a reference for it.
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	do {
		css = cgroup_css(cgrp, ss);

		if (css)
			return css;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	return init_css_set.subsys[ss->id];
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;

	return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count;

	spin_lock_irq(&css_set_lock);
	count = __cgroup_task_count(cgrp);
	spin_unlock_irq(&css_set_lock);

	return count;
}
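
/*
 * Note (added commentary): callers that already hold css_set_lock (for
 * example while walking @cgrp's cset links) should use
 * __cgroup_task_count(); everyone else uses cgroup_task_count(), which
 * acquires and releases the lock itself.
 */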

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
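
/*
 * Usage sketch (added commentary): a typical walk over every css attached
 * to a cgroup while holding cgroup_mutex looks like
 *
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	for_each_css(css, ssid, cgrp)
 *		kill_css(css);
 */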

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
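
/*
 * Usage sketch (added commentary): the macro pair above works like a loop
 * with an explicit terminator, e.g.
 *
 *	do_each_subsys_mask(ss, ssid, ss_mask) {
 *		pr_debug("considering %s\n", ss->name);
 *	} while_each_subsys_mask();
 *
 * where ss_mask is whatever u16 subsystem bitmask the caller holds.
 */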

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
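
/*
 * Usage sketch (added commentary): subtree-wide operations typically pair
 * these walks, propagating changes top-down and cleaning up bottom-up:
 *
 *	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
 *		... enable or refresh csses on dsct ...
 *	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)
 *		... kill or drain csses on dsct ...
 */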

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * The following field is re-initialized when this cset gets linked
	 * in cgroup_init().  However, let's initialize the field
	 * statically too so that the default cgroup can be accessed safely
	 * early during boot.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

static int css_set_count	= 1;	/* 1 for init_css_set */

static bool css_set_threaded(struct css_set *cset)
{
	return cset->dom_cset != cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if both
 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 * 1 otherwise.  When the sum changes from or to zero, userland is notified
 * that the content of the interface file has changed.  This can be used to
 * detect when @cgrp and its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/*
 * @task is leaving, advance task iterators which are pointing to it so
 * that they can resume at the next position.  Advancing an iterator might
 * remove it from the list, use safe walk.  See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

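/*
 * Note (added commentary): the hash key is simply the folded sum of the
 * css pointers; collisions are harmless because every bucket hit is
 * re-verified with compare_css_sets() before being used.
 */
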
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/* @cset's domain should match the default cgroup's */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css_by_mask(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;