cgroup-v1.c 33.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
2
3
#include "cgroup-internal.h"

4
#include <linux/ctype.h>
5
6
#include <linux/kmod.h>
#include <linux/sort.h>
7
#include <linux/delay.h>
8
#include <linux/mm.h>
9
#include <linux/sched/signal.h>
10
#include <linux/sched/task.h>
11
#include <linux/magic.h>
12
13
14
15
16
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
17
#include <linux/fs_parser.h>
18
19
20

#include <trace/events/cgroup.h>

21
#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
22

23
24
25
26
27
28
29
30
31
32
33
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

34
35
36
/* disable named v1 mounts */
static bool cgroup_no_v1_named;

37
38
39
40
41
42
43
44
45
46
/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
47
static DEFINE_SPINLOCK(release_agent_path_lock);
48

49
bool cgroup1_ssid_disabled(int ssid)
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
{
	return cgroup_no_v1_mask & (1 << ssid);
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		if (root == &cgrp_dfl_root)
			continue;

		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

/**
 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
100
	DEFINE_CGROUP_MGCTX(mgctx);
101
102
103
104
105
106
107
108
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (cgroup_on_dfl(to))
		return -EINVAL;

109
110
111
	ret = cgroup_migrate_vet_dst(to);
	if (ret)
		return ret;
112
113
114
115
116
117
118
119

	mutex_lock(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
120
		cgroup_migrate_add_src(link->cset, to, &mgctx);
121
122
	spin_unlock_irq(&css_set_lock);

123
	ret = cgroup_migrate_prepare_dst(&mgctx);
124
125
126
127
128
129
130
131
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
132
		css_task_iter_start(&from->self, 0, &it);
133
134
135
136
137

		do {
			task = css_task_iter_next(&it);
		} while (task && (task->flags & PF_EXITING));

138
139
140
141
142
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
143
			ret = cgroup_migrate(task, false, &mgctx);
144
			if (!ret)
145
				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
146
147
148
149
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
150
	cgroup_migrate_finish(&mgctx);
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	*/
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
200
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
Marc Koderer's avatar
Marc Koderer committed
228
		kvfree(l->list);
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so i starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 */
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
Marc Koderer's avatar
Marc Koderer committed
349
	array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
350
351
352
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
353
	css_task_iter_start(&cgrp->self, 0, &it);
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
	while ((tsk = css_task_iter_next(&it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
Marc Koderer's avatar
Marc Koderer committed
374
		kvfree(array);
375
376
377
378
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
Marc Koderer's avatar
Marc Koderer committed
379
	kvfree(l->list);
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}

/*
 * seq_file methods for the tasks/procs files. The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */

static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}

static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}

static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;
	pid_t *p = v;
	pid_t *end = l->list + l->length;
	/*
	 * Advance to the next pid in the array. If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}

static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);

	return 0;
}

490
491
492
static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
				     char *buf, size_t nbytes, loff_t off,
				     bool threadgroup)
493
{
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
	struct cgroup *cgrp;
	struct task_struct *task;
	const struct cred *cred, *tcred;
	ssize_t ret;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, threadgroup);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	/*
	 * Even if we're attaching all tasks in the thread group, we only
	 * need to check permissions on one of them.
	 */
	cred = current_cred();
	tcred = get_task_cred(task);
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;
	put_cred(tcred);
	if (ret)
		goto out_finish;

	ret = cgroup_attach_task(cgrp, task, threadgroup);

out_finish:
	cgroup_procs_write_finish(task);
out_unlock:
	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, true);
}

static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, false);
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
}

static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

/* cgroup core interface files for the legacy hierarchies */
612
struct cftype cgroup1_base_files[] = {
613
614
615
616
617
618
619
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
620
		.write = cgroup1_procs_write,
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
639
		.write = cgroup1_tasks_write,
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};

/* Display information about each subsystem and each hierarchy */
657
int proc_cgroupstats_show(struct seq_file *m, void *v)
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	mutex_unlock(&cgroup_mutex);
	return 0;
}

/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
709
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
710
711
712
713
714
715
716
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

717
	css_task_iter_start(&cgrp->self, 0, &it);
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

744
void cgroup1_check_for_release(struct cgroup *cgrp)
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
{
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
774
void cgroup1_release_agent(struct work_struct *work)
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL;
	char *argv[3], *envp[3];
	int ret;

	mutex_lock(&cgroup_mutex);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	spin_lock_irq(&css_set_lock);
	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	if (ret < 0 || ret >= PATH_MAX)
		goto out;

	argv[0] = agentbuf;
	argv[1] = pathbuf;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	mutex_unlock(&cgroup_mutex);
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	goto out_free;
out:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
817
818
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			  const char *new_name_str)
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  kernfs_rename() doesn't require active_ref
	 * protection.  Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret)
840
		TRACE_CGROUP_PATH(rename, cgrp);
841
842
843
844
845
846
847
848

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}

849
850
851
852
853
854
855
856
857
858
859
860
861
static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");
862
863
	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
		seq_puts(seq, ",cpuset_v2_mode");
864
865
866
867
868
869
870
871
872
873
874
875
876
877

	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}

878
879
880
881
882
883
884
885
886
887
enum cgroup1_param {
	Opt_all,
	Opt_clone_children,
	Opt_cpuset_v2_mode,
	Opt_name,
	Opt_none,
	Opt_noprefix,
	Opt_release_agent,
	Opt_xattr,
};
888

889
890
891
892
893
894
895
896
897
898
899
static const struct fs_parameter_spec cgroup1_param_specs[] = {
	fsparam_flag  ("all",		Opt_all),
	fsparam_flag  ("clone_children", Opt_clone_children),
	fsparam_flag  ("cpuset_v2_mode", Opt_cpuset_v2_mode),
	fsparam_string("name",		Opt_name),
	fsparam_flag  ("none",		Opt_none),
	fsparam_flag  ("noprefix",	Opt_noprefix),
	fsparam_string("release_agent",	Opt_release_agent),
	fsparam_flag  ("xattr",		Opt_xattr),
	{}
};
900

901
902
903
904
const struct fs_parameter_description cgroup1_fs_parameters = {
	.name		= "cgroup1",
	.specs		= cgroup1_param_specs,
};
905

906
907
908
909
910
911
912
913
914
915
916
917
918
919
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_subsys *ss;
	struct fs_parse_result result;
	int opt, i;

	opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
	if (opt == -ENOPARAM) {
		if (strcmp(param->key, "source") == 0) {
			fc->source = param->string;
			param->string = NULL;
			return 0;
		}
920
		for_each_subsys(ss, i) {
921
			if (strcmp(param->key, ss->legacy_name))
922
				continue;
923
			ctx->subsys_mask |= (1 << i);
924
			return 0;
925
		}
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
		return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
	}
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_none:
		/* Explicitly have no subsystems */
		ctx->none = true;
		break;
	case Opt_all:
		ctx->all_ss = true;
		break;
	case Opt_noprefix:
		ctx->flags |= CGRP_ROOT_NOPREFIX;
		break;
	case Opt_clone_children:
		ctx->cpuset_clone_children = true;
		break;
	case Opt_cpuset_v2_mode:
		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
		break;
	case Opt_xattr:
		ctx->flags |= CGRP_ROOT_XATTR;
		break;
	case Opt_release_agent:
		/* Specifying two release agents is forbidden */
		if (ctx->release_agent)
			return cg_invalf(fc, "cgroup1: release_agent respecified");
		ctx->release_agent = param->string;
		param->string = NULL;
		break;
	case Opt_name:
		/* blocked by boot param? */
		if (cgroup_no_v1_named)
961
			return -ENOENT;
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
		/* Can't specify an empty name */
		if (!param->size)
			return cg_invalf(fc, "cgroup1: Empty name");
		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
			return cg_invalf(fc, "cgroup1: Name too long");
		/* Must match [\w.-]+ */
		for (i = 0; i < param->size; i++) {
			char c = param->string[i];
			if (isalnum(c))
				continue;
			if ((c == '.') || (c == '-') || (c == '_'))
				continue;
			return cg_invalf(fc, "cgroup1: Invalid name");
		}
		/* Specifying two names is forbidden */
		if (ctx->name)
			return cg_invalf(fc, "cgroup1: name respecified");
		ctx->name = param->string;
		param->string = NULL;
		break;
982
	}
983
984
985
	return 0;
}

986
static int check_cgroupfs_options(struct fs_context *fc)
987
{
988
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
989
990
991
992
993
994
995
996
997
998
999
1000
1001
	u16 mask = U16_MAX;
	u16 enabled = 0;
	struct cgroup_subsys *ss;
	int i;

#ifdef CONFIG_CPUSETS
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif
	for_each_subsys(ss, i)
		if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
			enabled |= 1 << i;

	ctx->subsys_mask &= enabled;
1002
1003

	/*
1004
1005
	 * In absense of 'none', 'name=' or subsystem name options,
	 * let's default to 'all'.
1006
	 */
1007
1008
1009
1010
1011
1012
	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
		ctx->all_ss = true;

	if (ctx->all_ss) {
		/* Mutually exclusive option 'all' + subsystem name */
		if (ctx->subsys_mask)
1013
			return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
1014
1015
1016
		/* 'all' => select all the subsystems */
		ctx->subsys_mask = enabled;
	}
1017
1018
1019
1020
1021

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
1022
	if (!ctx->subsys_mask && !ctx->name)
1023
		return cg_invalf(fc, "cgroup1: Need name or subsystem set");
1024
1025
1026
1027
1028
1029

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
1030
	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
1031
		return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
1032
1033

	/* Can't specify "none" and some subsystems */
1034
	if (ctx->subsys_mask && ctx->none)
1035
		return cg_invalf(fc, "cgroup1: none used incorrectly");
1036
1037
1038
1039

	return 0;
}

1040
int cgroup1_reconfigure(struct fs_context *fc)
1041
{
1042
1043
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
1044
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1045
	int ret = 0;
1046
1047
1048
1049
1050
	u16 added_mask, removed_mask;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
1051
	ret = check_cgroupfs_options(fc);
1052
1053
1054
	if (ret)
		goto out_unlock;

1055
	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
1056
1057
1058
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

1059
1060
	added_mask = ctx->subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~ctx->subsys_mask;
1061
1062

	/* Don't allow flags or name to change at remount */
1063
1064
	if ((ctx->flags ^ root->flags) ||
	    (ctx->name && strcmp(ctx->name, root->name))) {
1065
		cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
1066
		       ctx->flags, ctx->name ?: "", root->flags, root->name);
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

1083
	if (ctx->release_agent) {
1084
		spin_lock(&release_agent_path_lock);
1085
		strcpy(root->release_agent_path, ctx->release_agent);
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
		spin_unlock(&release_agent_path_lock);
	}

	trace_cgroup_remount(root);

 out_unlock:
	mutex_unlock(&cgroup_mutex);
	return ret;
}

struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
	.rename			= cgroup1_rename,
	.show_options		= cgroup1_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

1104
1105
1106
1107
1108
1109
1110
1111
1112
/*
 * The guts of cgroup1 mount - find or create cgroup_root to use.
 * Called with cgroup_mutex held; returns 0 on success, -E... on
 * error and positive - in case when the candidate is busy dying.
 * On success it stashes a reference to cgroup_root into given
 * cgroup_fs_context; that reference is *NOT* counting towards the
 * cgroup_root refcount.
 */
static int cgroup1_root_to_use(struct fs_context *fc)
1113
{
1114
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1115
1116
1117
1118
1119
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	int i, ret;

	/* First find the desired set of subsystems */
1120
	ret = check_cgroupfs_options(fc);
1121
	if (ret)
1122
		return ret;
1123
1124
1125
1126
1127
1128
1129
1130
1131

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
1132
		if (!(ctx->subsys_mask & (1 << i)) ||
1133
1134
1135
		    ss->root == &cgrp_dfl_root)
			continue;

1136
1137
		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
			return 1;	/* restart */
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but sybsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
1152
1153
		if (ctx->name) {
			if (strcmp(ctx->name, root->name))
1154
1155
1156
1157
1158
1159
1160
1161
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
1162
1163
		if ((ctx->subsys_mask || ctx->none) &&
		    (ctx->subsys_mask != root->subsys_mask)) {
1164
1165
			if (!name_match)
				continue;
1166
			return -EBUSY;
1167
1168
		}

1169
		if (root->flags ^ ctx->flags)
1170
1171
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

1172
		ctx->root = root;
1173
		return 0;
1174
1175
1176
1177
1178
1179
1180
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
1181
1182
	if (!ctx->subsys_mask && !ctx->none)
		return cg_invalf(fc, "cgroup1: No subsys list or none specified");
1183
1184

	/* Hierarchies may only be created in the initial cgroup namespace. */
1185
	if (ctx->ns != &init_cgroup_ns)
1186
		return -EPERM;
1187
1188

	root = kzalloc(sizeof(*root), GFP_KERNEL);
1189
1190
	if (!root)
		return -ENOMEM;
1191

1192
1193
	ctx->root = root;
	init_cgroup_root(ctx);
1194

1195
	ret = cgroup_setup_root(root, ctx->subsys_mask);
1196
1197
	if (ret)
		cgroup_free_root(root);
1198
1199
1200
1201
1202
1203
1204
1205
1206
	return ret;
}

int cgroup1_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	/* Check if the caller has permission to mount. */
1207
	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
1208
1209
1210
1211
1212
1213
1214
		return -EPERM;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	ret = cgroup1_root_to_use(fc);
	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
		ret = 1;	/* restart */
1215
1216
1217

	mutex_unlock(&cgroup_mutex);

1218
	if (!ret)
1219
		ret = cgroup_do_get_tree(fc);
1220
1221

	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
1222
1223
		struct super_block *sb = fc->root->d_sb;
		dput(fc->root);
1224
		deactivate_locked_super(sb);
1225
1226
1227
1228
		ret = 1;
	}

	if (unlikely(ret > 0)) {
1229
		msleep(10);
1230
		return restart_syscall();
1231
	}
1232
	return ret;
1233
1234
}

1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
static int __init cgroup1_wq_init(void)
{
	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
}
core_initcall(cgroup1_wq_init);

static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
			cgroup_no_v1_mask = U16_MAX;
1260
1261
1262
1263
1264
1265
			continue;
		}

		if (!strcmp(token, "named")) {
			cgroup_no_v1_named = true;
			continue;
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);