// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */

#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)

static const char * const resident_page_types[] = {
	NAMED_ARRAY_INDEX(MM_FILEPAGES),
	NAMED_ARRAY_INDEX(MM_ANONPAGES),
	NAMED_ARRAY_INDEX(MM_SWAPENTS),
	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

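/*
 * Approximate number of processes in the system: the per-CPU fork/exit
 * counters are summed without locking.
 */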
int nr_processes(void)
{
	int cpu;
	int total = 0;

	for_each_possible_cpu(cpu)
		total += per_cpu(process_counts, cpu);

	return total;
}

void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

static int free_vm_stack_cache(unsigned int cpu)
{
	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_stack = cached_vm_stacks[i];

		if (!vm_stack)
			continue;

		vfree(vm_stack->addr);
		cached_vm_stacks[i] = NULL;
	}

	return 0;
}
#endif

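/*
 * Allocate a kernel stack for @tsk. With CONFIG_VMAP_STACK the stack is
 * vmapped and taken from the per-CPU cache when possible; otherwise it
 * comes from the page allocator as physically contiguous pages.
 */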
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
	void *stack;
	int i;

	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *s;

		s = this_cpu_xchg(cached_stacks[i], NULL);

		if (!s)
			continue;

		/* Mark stack accessible for KASAN. */
		kasan_unpoison_range(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);

		tsk->stack_vm_area = s;
		tsk->stack = s->addr;
		return s->addr;
	}

	/*
	 * Allocated stacks are cached and later reused by new threads,
	 * so memcg accounting is performed manually on assigning/releasing
	 * stacks to tasks. Drop __GFP_ACCOUNT.
	 */
	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
				     VMALLOC_START, VMALLOC_END,
				     THREADINFO_GFP & ~__GFP_ACCOUNT,
				     PAGE_KERNEL,
				     0, node, __builtin_return_address(0));

	/*
	 * We can't call find_vm_area() in interrupt context, and
	 * free_thread_stack() can be called in interrupt context,
	 * so cache the vm_struct.
	 */
	if (stack) {
		tsk->stack_vm_area = find_vm_area(stack);
		tsk->stack = stack;
	}
	return stack;
#else
	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
					     THREAD_SIZE_ORDER);

	if (likely(page)) {
		tsk->stack = kasan_reset_tag(page_address(page));
		return tsk->stack;
	}
	return NULL;
#endif
}

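/*
 * Free @tsk's stack. A vmapped stack is uncharged from memcg and parked
 * in the per-CPU cache if a slot is free, otherwise it is vfreed; a
 * page-based stack is returned to the page allocator.
 */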
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
			memcg_kmem_uncharge_page(vm->pages[i], 0);

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			if (this_cpu_cmpxchg(cached_stacks[i],
					NULL, tsk->stack_vm_area) != NULL)
				continue;

			return;
		}

		vfree_atomic(tsk->stack);
		return;
	}
#endif

	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
						  int node)
{
	unsigned long *stack;
	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
	stack = kasan_reset_tag(stack);
	tsk->stack = stack;
	return stack;
}

static void free_thread_stack(struct task_struct *tsk)
{
	kmem_cache_free(thread_stack_cache, tsk->stack);
}

void thread_stack_cache_init(void)
{
	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
					THREAD_SIZE, THREAD_SIZE, 0, 0,
					THREAD_SIZE, NULL);
	BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);
	return vma;
}

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

	if (new) {
		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
		/*
		 * orig->shared.rb may be modified concurrently, but the clone
		 * will be reinitialized.
		 */
		*new = data_race(*orig);
		INIT_LIST_HEAD(&new->anon_vma_chain);
		new->vm_next = new->vm_prev = NULL;
	}
	return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}

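/*
 * Add (account == 1) or subtract (account == -1) @tsk's stack size, in KiB,
 * from the NR_KERNEL_STACK_KB counter of the backing node/memcg.
 */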
static void account_kernel_stack(struct task_struct *tsk, int account)
{
	void *stack = task_stack_page(tsk);
	struct vm_struct *vm = task_stack_vm_area(tsk);


	/* All stack pages are in the same node. */
	if (vm)
		mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
				      account * (THREAD_SIZE / 1024));
	else
		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
				      account * (THREAD_SIZE / 1024));
}

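/*
 * Charge each page of a vmapped kernel stack to the memory cgroup; with a
 * non-vmapped stack this is a no-op.
 */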
static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);
	int ret;

	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

	if (vm) {
		int i;

		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/*
			 * If memcg_kmem_charge_page() fails, page's
			 * memory cgroup pointer is NULL, and
			 * memcg_kmem_uncharge_page() in free_thread_stack()
			 * will ignore this page.
			 */
			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
						     0);
			if (ret)
				return ret;
		}
	}
#endif
	return 0;
}

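/* Unaccount and free the kernel stack of a dead task. */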
static void release_task_stack(struct task_struct *tsk)
{
	if (WARN_ON(tsk->state != TASK_DEAD))
		return;  /* Better to leak the stack than to free prematurely */

	account_kernel_stack(tsk, -1);
	free_thread_stack(tsk);
	tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->stack_refcount))
		release_task_stack(tsk);
}
#endif

void free_task(struct task_struct *tsk)
{
	scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * The task is finally done with both the stack and thread_info,
	 * so free both.
	 */
	release_task_stack(tsk);
#else
	/*
	 * If the task had a separate stack allocation, it should be gone
	 * by now.
	 */
	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	arch_release_task_struct(tsk);
	if (tsk->flags & PF_KTHREAD)
		free_kthread_struct(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

#ifdef CONFIG_MMU
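/*
 * Duplicate the parent's address space: copy each VMA (except those marked
 * VM_DONTCOPY) into the new mm and copy the corresponding page tables.
 */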
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	LIST_HEAD(uf);

	uprobe_start_dup_mmap();
	if (mmap_write_lock_killable(oldmm)) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

	mm->total_vm = oldmm->total_vm;
	mm->data_vm = oldmm->data_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
		charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto out;
		}
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = vm_area_dup(mpnt);
		if (!tmp)
			goto fail_nomem;
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/*
			 * VM_WIPEONFORK gets a clean slate in the child.
			 * Don't prepare anon_vma until fault since we don't
			 * copy page for current vma.
			 */
			tmp->anon_vma = NULL;
		} else if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				put_write_access(inode);
			i_mmap_lock_write(mapping);
			if (tmp->vm_flags & VM_SHARED)
				mapping_allow_writable(mapping);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			i_mmap_unlock_write(mapping);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(tmp, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	retval = arch_dup_mmap(oldmm, mm);
out:
	mmap_write_unlock(mm);
	flush_tlb_mm(oldmm);
	mmap_write_unlock(oldmm);
	dup_userfaultfd_complete(&uf);
fail_uprobe_end:
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	vm_area_free(tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	mmap_write_lock(oldmm);
	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
	mmap_write_unlock(oldmm);
	return 0;
}
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

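/* Sanity-check that an mm being freed has no leftover RSS or page tables. */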
static void check_mm(struct mm_struct *mm)
{
	int i;

	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
			 "Please make sure 'struct resident_page_types[]' is updated as well");

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
				 mm, resident_page_types[i], x);
	}

	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
				mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	WARN_ON_ONCE(mm == current->mm);
	WARN_ON_ONCE(mm == current->active_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_subscriptions_destroy(mm);
	check_mm(mm);
	put_user_ns(mm->user_ns);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

static void mmdrop_async_fn(struct work_struct *work)
{
	struct mm_struct *mm;

	mm = container_of(work, struct mm_struct, async_put_work);
	__mmdrop(mm);
}

static void mmdrop_async(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
		schedule_work(&mm->async_put_work);
	}
}

static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
		mmdrop_async(sig->oom_mm);
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
	if (refcount_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}

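/* Final cleanup once the last reference to the task_struct is dropped. */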
void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

	io_uring_free(tsk);
	cgroup_free(tsk);
	task_numa_free(tsk, true);
	security_task_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
	u64 threads;
	unsigned long nr_pages = totalram_pages();

	/*
	 * The number of threads shall be limited such that the thread
	 * structures may only consume a small part of the available memory.
	 */
	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
		threads = MAX_THREADS;
	else
		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);

	if (threads > max_threads_suggested)
		threads = max_threads_suggested;

	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
	/* Fetch thread_struct whitelist for the architecture. */
	arch_thread_struct_whitelist(offset, size);

	/*
	 * Handle zero-sized whitelist or empty thread_struct, otherwise
	 * adjust offset to position of thread_struct in task_struct.
	 */
	if (unlikely(*size == 0))
		*offset = 0;
	else
		*offset += offsetof(struct task_struct, thread);
}
#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */

void __init fork_init(void)
{
	int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
	unsigned long useroffset, usersize;

	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT,
			useroffset, usersize, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < UCOUNT_COUNTS; i++) {
		init_user_ns.ucount_max[i] = max_threads/2;
	}

#ifdef CONFIG_VMAP_STACK
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	scs_init();

	lockdep_init_task(&init_task);
	uprobes_init();
}

int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

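/*
 * Allocate a task_struct and kernel stack for the child and copy the
 * parent's task_struct contents into it.
 */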
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	unsigned long *stack;
	struct vm_struct *stack_vm_area __maybe_unused;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
		goto free_tsk;

	if (memcg_charge_kernel_stack(tsk))
		goto free_stack;

	stack_vm_area = task_stack_vm_area(tsk);

	err = arch_dup_task_struct(tsk, orig);

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif

	if (err)
		goto free_stack;

	err = scs_prepare(tsk, node);
	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;

	account_kernel_stack(tsk, 1);

	kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_queue = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif
	return tsk;

free_stack:
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner == p)
		WRITE_ONCE(mm->owner, NULL);
#endif
}

static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}

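/* Initialize a freshly allocated mm_struct before first use. */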
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	seqcount_init(&mm->write_protect_seq);
	mmap_init_lock(mm);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->core_state = NULL;
	mm_pgtables_bytes_init(mm);
	mm->map_count = 0;
	mm->locked_vm = 0;
	atomic_set(&mm->has_pinned, 0);
	atomic64_set(&mm->pinned_vm, 0);
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->arg_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif
	mm_init_uprobes_state(mm);

	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
		mm->def_flags = 0;
	}

	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;

	mm->user_ns = get_user_ns(user_ns);
	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
	free_mm(mm);
	return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct *mm_alloc(void)
{
	struct mm_struct *mm;

	mm = allocate_mm();