mmap.c 100 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
Linus Torvalds's avatar
Linus Torvalds committed
2
3
4
5
6
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
Alan Cox's avatar
Alan Cox committed
7
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
Linus Torvalds's avatar
Linus Torvalds committed
8
9
 */

10
11
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

12
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
13
#include <linux/slab.h>
Alexey Dobriyan's avatar
Alexey Dobriyan committed
14
#include <linux/backing-dev.h>
Linus Torvalds's avatar
Linus Torvalds committed
15
#include <linux/mm.h>
Davidlohr Bueso's avatar
Davidlohr Bueso committed
16
#include <linux/vmacache.h>
Linus Torvalds's avatar
Linus Torvalds committed
17
18
19
20
21
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
22
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
23
24
25
26
27
28
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
29
#include <linux/shmem_fs.h>
Linus Torvalds's avatar
Linus Torvalds committed
30
#include <linux/profile.h>
31
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
32
33
34
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
35
#include <linux/mmu_notifier.h>
36
#include <linux/mmdebug.h>
37
#include <linux/perf_event.h>
Al Viro's avatar
Al Viro committed
38
#include <linux/audit.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
39
#include <linux/khugepaged.h>
40
#include <linux/uprobes.h>
41
#include <linux/rbtree_augmented.h>
42
43
#include <linux/notifier.h>
#include <linux/memory.h>
44
#include <linux/printk.h>
45
#include <linux/userfaultfd_k.h>
46
#include <linux/moduleparam.h>
47
#include <linux/pkeys.h>
48
#include <linux/oom.h>
49
#include <linux/sched/mm.h>
Linus Torvalds's avatar
Linus Torvalds committed
50

51
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
52
53
#include <asm/cacheflush.h>
#include <asm/tlb.h>
54
#include <asm/mmu_context.h>
Linus Torvalds's avatar
Linus Torvalds committed
55

56
57
#include "internal.h"

58
59
60
61
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

62
63
64
65
66
67
68
69
70
71
72
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

73
static bool ignore_rlimit_data;
74
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
75

76
77
78
79
static void unmap_region(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		unsigned long start, unsigned long end);

Linus Torvalds's avatar
Linus Torvalds committed
80
81
82
83
84
85
86
87
88
/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
vishnu.ps's avatar
vishnu.ps committed
89
 *
Linus Torvalds's avatar
Linus Torvalds committed
90
91
92
93
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
94
95
96
97
98
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE:
 *								r: (no) no
 *								w: (no) no
 *								x: (yes) yes
Linus Torvalds's avatar
Linus Torvalds committed
99
 */
100
/*
 * Indexed by vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED); first eight
 * entries are the private (__P) protections, last eight the shared (__S)
 * ones.  See the behaviour table in the comment above.
 */
pgprot_t protection_map[16] __ro_after_init = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

105
106
107
108
109
110
111
#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
/* Fallback when the arch does not post-filter page protections: identity. */
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
	return prot;
}
#endif

112
113
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
114
	pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
115
116
				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
			pgprot_val(arch_vm_get_page_prot(vm_flags)));
117
118

	return arch_filter_pgprot(ret);
119
120
121
}
EXPORT_SYMBOL(vm_get_page_prot);

122
123
124
125
126
127
128
129
130
/*
 * Recompute the protection bits for @vm_flags while preserving the
 * non-protection (e.g. caching) bits already present in @oldprot.
 */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}

/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;
131
	pgprot_t vm_page_prot;
132

133
134
	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
	if (vma_wants_writenotify(vma, vm_page_prot)) {
135
		vm_flags &= ~VM_SHARED;
136
		vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
137
	}
138
139
	/* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
140
141
}

Linus Torvalds's avatar
Linus Torvalds committed
142
/*
 * Unlink @vma from the address_space bookkeeping of its backing @file.
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
		struct file *file, struct address_space *mapping)
{
	/* Undo the i_writecount suppression taken for MAP_DENYWRITE. */
	if (vma->vm_flags & VM_DENYWRITE)
		atomic_inc(&file_inode(file)->i_writecount);
	/* Drop the writable-mapping count taken for shared mappings. */
	if (vma->vm_flags & VM_SHARED)
		mapping_unmap_writable(mapping);

	flush_dcache_mmap_lock(mapping);
	vma_interval_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
159
 * Unlink a file-based vm structure from its interval tree, to hide
160
 * vma from rmap and vmtruncate before freeing its page tables.
Linus Torvalds's avatar
Linus Torvalds committed
161
 */
162
void unlink_file_vma(struct vm_area_struct *vma)
Linus Torvalds's avatar
Linus Torvalds committed
163
164
165
166
167
{
	struct file *file = vma->vm_file;

	if (file) {
		struct address_space *mapping = file->f_mapping;
168
		i_mmap_lock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
169
		__remove_shared_vm_struct(vma, file, mapping);
170
		i_mmap_unlock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
171
	}
172
173
174
175
176
177
178
179
180
181
}

/*
 * Close a vm structure and free it, returning the next.
 */
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *next = vma->vm_next;

	might_sleep();
	/* Give the driver/filesystem a chance to tear down per-vma state. */
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	/* Drop the file reference taken when the mapping was created. */
	if (vma->vm_file)
		fput(vma->vm_file);
	mpol_put(vma_policy(vma));
	vm_area_free(vma);
	return next;
}

191
192
static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
		struct list_head *uf);
193
/*
 * sys_brk - grow or shrink the program break.
 *
 * On success returns the new (requested) brk value; on failure returns
 * the original brk so callers can detect that nothing changed.  Shrinking
 * may downgrade mmap_sem to read inside __do_munmap(); "downgraded"
 * records which unlock we owe on the way out.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long retval;
	unsigned long newbrk, oldbrk, origbrk;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *next;
	unsigned long min_brk;
	bool populate;
	bool downgraded = false;
	LIST_HEAD(uf);

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
	/*
	 * CONFIG_COMPAT_BRK can still be overridden by setting
	 * randomize_va_space to 2, which will still cause mm->start_brk
	 * to be arbitrarily shifted
	 */
	if (current->brk_randomized)
		min_brk = mm->start_brk;
	else
		min_brk = mm->end_data;
#else
	min_brk = mm->start_brk;
#endif
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit in the case where the limit is
	 * not page aligned -Ram Gupta
	 */
	if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
			      mm->end_data, mm->start_data))
		goto out;

	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	/* Same page: just record the exact byte value requested. */
	if (oldbrk == newbrk) {
		mm->brk = brk;
		goto success;
	}

	/*
	 * Always allow shrinking brk.
	 * __do_munmap() may downgrade mmap_sem to read.
	 */
	if (brk <= mm->brk) {
		int ret;

		/*
		 * mm->brk must be protected by write mmap_sem so update it
		 * before downgrading mmap_sem. When __do_munmap() fails,
		 * mm->brk will be restored from origbrk.
		 */
		mm->brk = brk;
		ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
		if (ret < 0) {
			mm->brk = origbrk;
			goto out;
		} else if (ret == 1) {
			/* ret == 1: munmap downgraded mmap_sem to read. */
			downgraded = true;
		}
		goto success;
	}

	/* Check against existing mmap mappings. */
	next = find_vma(mm, oldbrk);
	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
		goto out;
	mm->brk = brk;

success:
	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
	if (downgraded)
		up_read(&mm->mmap_sem);
	else
		up_write(&mm->mmap_sem);
	userfaultfd_unmap_complete(mm, &uf);
	/* mlock'd mm: fault the newly extended range in right away. */
	if (populate)
		mm_populate(oldbrk, newbrk - oldbrk);
	return brk;

out:
	retval = origbrk;
	up_write(&mm->mmap_sem);
	return retval;
}

292
293
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
	unsigned long max, prev_end, subtree_gap;

	/*
	 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
	 * allow two stack_guard_gaps between them here, and when choosing
	 * an unmapped area; whereas when expanding we only require one.
	 * That's a little inconsistent, but keeps the code here simpler.
	 */
	max = vm_start_gap(vma);
	if (vma->vm_prev) {
		prev_end = vm_end_gap(vma->vm_prev);
		if (max > prev_end)
			max -= prev_end;
		else
			max = 0;
	}
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
	if (vma->vm_rb.rb_left) {
		subtree_gap = rb_entry(vma->vm_rb.rb_left,
				struct vm_area_struct, vm_rb)->rb_subtree_gap;
		if (subtree_gap > max)
			max = subtree_gap;
	}
	if (vma->vm_rb.rb_right) {
		subtree_gap = rb_entry(vma->vm_rb.rb_right,
				struct vm_area_struct, vm_rb)->rb_subtree_gap;
		if (subtree_gap > max)
			max = subtree_gap;
	}
	return max;
}

325
#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug walk of the whole mm rbtree: verify in-order start addresses,
 * non-overlap with the previous vma, start <= end, and that each node's
 * cached rb_subtree_gap matches a fresh computation.  Returns the node
 * count, or -1 if any inconsistency was found.
 */
static int browse_rb(struct mm_struct *mm)
{
	struct rb_root *root = &mm->mm_rb;
	int i = 0, j, bug = 0;
	struct rb_node *nd, *pn = NULL;
	unsigned long prev = 0, pend = 0;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
		if (vma->vm_start < prev) {
			pr_emerg("vm_start %lx < prev %lx\n",
				  vma->vm_start, prev);
			bug = 1;
		}
		if (vma->vm_start < pend) {
			pr_emerg("vm_start %lx < pend %lx\n",
				  vma->vm_start, pend);
			bug = 1;
		}
		if (vma->vm_start > vma->vm_end) {
			pr_emerg("vm_start %lx > vm_end %lx\n",
				  vma->vm_start, vma->vm_end);
			bug = 1;
		}
		/* page_table_lock keeps rb_subtree_gap stable while we check. */
		spin_lock(&mm->page_table_lock);
		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
			pr_emerg("free gap %lx, correct %lx\n",
			       vma->rb_subtree_gap,
			       vma_compute_subtree_gap(vma));
			bug = 1;
		}
		spin_unlock(&mm->page_table_lock);
		i++;
		pn = nd;
		prev = vma->vm_start;
		pend = vma->vm_end;
	}
	/* Walk backwards from the last node; counts must agree. */
	j = 0;
	for (nd = pn; nd; nd = rb_prev(nd))
		j++;
	if (i != j) {
		pr_emerg("backwards %d, forwards %d\n", j, i);
		bug = 1;
	}
	return bug ? -1 : i;
}

374
375
376
377
378
379
380
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
	struct rb_node *nd;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
381
382
383
		VM_BUG_ON_VMA(vma != ignore &&
			vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
			vma);
Linus Torvalds's avatar
Linus Torvalds committed
384
385
386
	}
}

387
/*
 * Full-mm consistency check: verify anon_vma interval trees, the vma list
 * count against mm->map_count, the cached highest_vm_end, and the rbtree
 * (via browse_rb).  BUGs via VM_BUG_ON_MM if anything is wrong.
 */
static void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	unsigned long highest_address = 0;
	struct vm_area_struct *vma = mm->mmap;

	while (vma) {
		struct anon_vma *anon_vma = vma->anon_vma;
		struct anon_vma_chain *avc;

		if (anon_vma) {
			anon_vma_lock_read(anon_vma);
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				anon_vma_interval_tree_verify(avc);
			anon_vma_unlock_read(anon_vma);
		}

		highest_address = vm_end_gap(vma);
		vma = vma->vm_next;
		i++;
	}
	if (i != mm->map_count) {
		pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
		bug = 1;
	}
	if (highest_address != mm->highest_vm_end) {
		pr_emerg("mm->highest_vm_end %lx, found %lx\n",
			  mm->highest_vm_end, highest_address);
		bug = 1;
	}
	i = browse_rb(mm);
	if (i != mm->map_count) {
		/* browse_rb == -1 means it already printed the problem. */
		if (i != -1)
			pr_emerg("map_count %d rb %d\n", mm->map_count, i);
		bug = 1;
	}
	VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif

431
432
433
434
435
436
437
438
439
440
441
442
/* Generate the augmented-rbtree callbacks that maintain rb_subtree_gap. */
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)

/*
 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
 * vma->vm_prev->vm_end values changed, without modifying the vma's position
 * in the rbtree.
 */
static void vma_gap_update(struct vm_area_struct *vma)
{
	/*
	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
	 * function that does exactly what we want.
	 */
	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}

/* Insert @vma into the mm rbtree, maintaining the augmented gap data. */
static inline void vma_rb_insert(struct vm_area_struct *vma,
				 struct rb_root *root)
{
	/* All rb_subtree_gap values must be consistent prior to insertion */
	validate_mm_rb(root, NULL);

	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

457
/* Erase @vma from the rbtree; callers must have validated gap data first. */
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
	/*
	 * Note rb_erase_augmented is a fairly large inline function,
	 * so make sure we instantiate it only once with our desired
	 * augmented rbtree callbacks.
	 */
	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
/* Erase @vma while tolerating a stale gap on @ignore (see comment below). */
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
						struct rb_root *root,
						struct vm_area_struct *ignore)
{
	/*
	 * All rb_subtree_gap values must be consistent prior to erase,
	 * with the possible exception of the "next" vma being erased if
	 * next->vm_start was reduced.
	 */
	validate_mm_rb(root, ignore);

	__vma_rb_erase(vma, root);
}

/* Erase @vma from the rbtree; only @vma itself may have a stale gap. */
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
					 struct rb_root *root)
{
	/*
	 * All rb_subtree_gap values must be consistent prior to erase,
	 * with the possible exception of the vma being erased.
	 */
	validate_mm_rb(root, vma);

	__vma_rb_erase(vma, root);
}

493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_sem and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	/* Take the vma out of every anon_vma interval tree it belongs to. */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	/* Reinsert the vma into every anon_vma interval tree it belongs to. */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

525
526
527
/*
 * Walk the mm rbtree to find the insertion point for a new vma covering
 * [addr, end).  On success returns 0 and fills in:
 *   *pprev     - the vma immediately preceding the range (or NULL),
 *   *rb_link   - the rbtree link where the new node should be attached,
 *   *rb_parent - the parent of that link (or NULL for an empty tree).
 * Returns -ENOMEM if an existing vma overlaps the requested range.
 */
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
		unsigned long end, struct vm_area_struct **pprev,
		struct rb_node ***rb_link, struct rb_node **rb_parent)
{
	struct rb_node **__rb_link, *__rb_parent, *rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			/* Fail if an existing vma overlaps the area */
			if (vma_tmp->vm_start < end)
				return -ENOMEM;
			__rb_link = &__rb_parent->rb_left;
		} else {
			/* Track the closest predecessor seen so far. */
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return 0;
}

559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
/*
 * Count how many pages of [addr, end) are already covered by existing
 * vmas, summing the overlap of each intersecting mapping.
 */
static unsigned long count_vma_pages_range(struct mm_struct *mm,
		unsigned long addr, unsigned long end)
{
	struct vm_area_struct *vma = find_vma_intersection(mm, addr, end);
	unsigned long nr_pages;

	/* No overlap at all: the range is entirely unmapped. */
	if (!vma)
		return 0;

	/* First overlapping vma may begin before addr or extend past end. */
	nr_pages = (min(end, vma->vm_end) -
		max(addr, vma->vm_start)) >> PAGE_SHIFT;

	/* Remaining overlaps all start at or after addr. */
	for (vma = vma->vm_next; vma && vma->vm_start <= end;
	     vma = vma->vm_next)
		nr_pages += (min(end, vma->vm_end) - vma->vm_start)
				>> PAGE_SHIFT;

	return nr_pages;
}

Linus Torvalds's avatar
Linus Torvalds committed
587
588
589
/*
 * Link @vma into the mm rbtree at the slot found by find_vma_links(),
 * keeping the augmented rb_subtree_gap data and mm->highest_vm_end correct.
 */
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
		struct rb_node **rb_link, struct rb_node *rb_parent)
{
	/* Update tracking information for the gap following the new vma. */
	if (vma->vm_next)
		vma_gap_update(vma->vm_next);
	else
		mm->highest_vm_end = vm_end_gap(vma);

	/*
	 * vma->vm_prev wasn't known when we followed the rbtree to find the
	 * correct insertion point for that vma. As a result, we could not
	 * update the vma vm_rb parents rb_subtree_gap values on the way down.
	 * So, we first insert the vma with a zero rb_subtree_gap value
	 * (to be consistent with what we did on the way down), and then
	 * immediately update the gap to the correct value. Finally we
	 * rebalance the rbtree after all augmented values have been set.
	 */
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	vma->rb_subtree_gap = 0;
	vma_gap_update(vma);
	vma_rb_insert(vma, &mm->mm_rb);
}

611
static void __vma_link_file(struct vm_area_struct *vma)
Linus Torvalds's avatar
Linus Torvalds committed
612
{
ZhenwenXu's avatar
ZhenwenXu committed
613
	struct file *file;
Linus Torvalds's avatar
Linus Torvalds committed
614
615
616
617
618
619

	file = vma->vm_file;
	if (file) {
		struct address_space *mapping = file->f_mapping;

		if (vma->vm_flags & VM_DENYWRITE)
Al Viro's avatar
Al Viro committed
620
			atomic_dec(&file_inode(file)->i_writecount);
Linus Torvalds's avatar
Linus Torvalds committed
621
		if (vma->vm_flags & VM_SHARED)
622
			atomic_inc(&mapping->i_mmap_writable);
Linus Torvalds's avatar
Linus Torvalds committed
623
624

		flush_dcache_mmap_lock(mapping);
625
		vma_interval_tree_insert(vma, &mapping->i_mmap);
Linus Torvalds's avatar
Linus Torvalds committed
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
		flush_dcache_mmap_unlock(mapping);
	}
}

/* Link @vma into both the mm's linked list and its rbtree. */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
}

/*
 * Fully link @vma into @mm: list, rbtree, and (for file mappings) the
 * address_space interval tree, taking i_mmap_rwsem around the file part.
 * Bumps mm->map_count.
 */
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
			struct vm_area_struct *prev, struct rb_node **rb_link,
			struct rb_node *rb_parent)
{
	struct address_space *mapping = NULL;

	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;
		i_mmap_lock_write(mapping);
	}

	__vma_link(mm, vma, prev, rb_link, rb_parent);
	__vma_link_file(vma);

	if (mapping)
		i_mmap_unlock_write(mapping);

	mm->map_count++;
	validate_mm(mm);
}

/*
 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
 * mm's list and rbtree.  It has already been inserted into the interval tree.
 */
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev;
	struct rb_node **rb_link, *rb_parent;

	/* The range was carved out by the caller, so overlap is a bug. */
	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
			   &prev, &rb_link, &rb_parent))
		BUG();
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
}

676
677
678
/*
 * Remove @vma from the mm rbtree and linked list.  @has_prev tells us the
 * caller already knows @prev; otherwise it is looked up via vma->vm_prev.
 * @ignore is passed through to vma_rb_erase_ignore() to exempt one vma
 * from gap validation.  Also invalidates the per-task vma cache.
 */
static __always_inline void __vma_unlink_common(struct mm_struct *mm,
						struct vm_area_struct *vma,
						struct vm_area_struct *prev,
						bool has_prev,
						struct vm_area_struct *ignore)
{
	struct vm_area_struct *next;

	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
	next = vma->vm_next;
	if (has_prev)
		prev->vm_next = next;
	else {
		prev = vma->vm_prev;
		if (prev)
			prev->vm_next = next;
		else
			/* vma was the first mapping in the list. */
			mm->mmap = next;
	}
	if (next)
		next->vm_prev = prev;

	/* Kill the cache */
	vmacache_invalidate(mm);
}

702
703
704
705
/* Unlink @vma when the caller already knows its predecessor @prev. */
static inline void __vma_unlink_prev(struct mm_struct *mm,
				     struct vm_area_struct *vma,
				     struct vm_area_struct *prev)
{
	__vma_unlink_common(mm, vma, prev, true, vma);
}

Linus Torvalds's avatar
Linus Torvalds committed
709
710
711
712
713
714
715
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 */
716
717
718
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
	struct vm_area_struct *expand)
Linus Torvalds's avatar
Linus Torvalds committed
719
720
{
	struct mm_struct *mm = vma->vm_mm;
721
	struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
Linus Torvalds's avatar
Linus Torvalds committed
722
	struct address_space *mapping = NULL;
723
	struct rb_root_cached *root = NULL;
724
	struct anon_vma *anon_vma = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
725
	struct file *file = vma->vm_file;
726
	bool start_changed = false, end_changed = false;
Linus Torvalds's avatar
Linus Torvalds committed
727
728
729
730
	long adjust_next = 0;
	int remove_next = 0;

	if (next && !insert) {
731
		struct vm_area_struct *exporter = NULL, *importer = NULL;
732

Linus Torvalds's avatar
Linus Torvalds committed
733
734
735
736
		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
737
			 * The only other cases that gets here are
738
			 * case 1, case 7 and case 8.
Linus Torvalds's avatar
Linus Torvalds committed
739
			 */
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
			if (next == expand) {
				/*
				 * The only case where we don't expand "vma"
				 * and we expand "next" instead is case 8.
				 */
				VM_WARN_ON(end != next->vm_end);
				/*
				 * remove_next == 3 means we're
				 * removing "vma" and that to do so we
				 * swapped "vma" and "next".
				 */
				remove_next = 3;
				VM_WARN_ON(file != next->vm_file);
				swap(vma, next);
			} else {
				VM_WARN_ON(expand != vma);
				/*
				 * case 1, 6, 7, remove_next == 2 is case 6,
				 * remove_next == 1 is case 1 or 7.
				 */
				remove_next = 1 + (end > next->vm_end);
				VM_WARN_ON(remove_next == 2 &&
					   end != next->vm_next->vm_end);
				VM_WARN_ON(remove_next == 1 &&
					   end != next->vm_end);
				/* trim end to next, for case 6 first pass */
				end = next->vm_end;
			}

769
			exporter = next;
Linus Torvalds's avatar
Linus Torvalds committed
770
			importer = vma;
771
772
773
774
775

			/*
			 * If next doesn't have anon_vma, import from vma after
			 * next, if the vma overlaps with it.
			 */
776
			if (remove_next == 2 && !next->anon_vma)
777
778
				exporter = next->vm_next;

Linus Torvalds's avatar
Linus Torvalds committed
779
780
781
782
783
784
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
785
			exporter = next;
Linus Torvalds's avatar
Linus Torvalds committed
786
			importer = vma;
787
			VM_WARN_ON(expand != importer);
Linus Torvalds's avatar
Linus Torvalds committed
788
789
790
791
792
793
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
vishnu.ps's avatar
vishnu.ps committed
794
			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
795
			exporter = vma;
Linus Torvalds's avatar
Linus Torvalds committed
796
			importer = next;
797
			VM_WARN_ON(expand != importer);
Linus Torvalds's avatar
Linus Torvalds committed
798
799
		}

800
801
802
803
804
		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
805
		if (exporter && exporter->anon_vma && !importer->anon_vma) {
806
807
			int error;

808
			importer->anon_vma = exporter->anon_vma;
809
			error = anon_vma_clone(importer, exporter);
810
			if (error)
811
				return error;
812
813
		}
	}
814
again:
815
	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
816

Linus Torvalds's avatar
Linus Torvalds committed
817
818
	if (file) {
		mapping = file->f_mapping;
819
820
		root = &mapping->i_mmap;
		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
821

822
823
		if (adjust_next)
			uprobe_munmap(next, next->vm_start, next->vm_end);
824

825
		i_mmap_lock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
826
827
		if (insert) {
			/*
828
			 * Put into interval tree now, so instantiated pages
Linus Torvalds's avatar
Linus Torvalds committed
829
830
831
832
833
834
835
836
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

837
838
839
840
	anon_vma = vma->anon_vma;
	if (!anon_vma && adjust_next)
		anon_vma = next->anon_vma;
	if (anon_vma) {
841
842
		VM_WARN_ON(adjust_next && next->anon_vma &&
			   anon_vma != next->anon_vma);
843
		anon_vma_lock_write(anon_vma);
844
845
846
847
		anon_vma_interval_tree_pre_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_pre_update_vma(next);
	}
848

Linus Torvalds's avatar
Linus Torvalds committed
849
850
	if (root) {
		flush_dcache_mmap_lock(mapping);
851
		vma_interval_tree_remove(vma, root);
Linus Torvalds's avatar
Linus Torvalds committed
852
		if (adjust_next)
853
			vma_interval_tree_remove(next, root);
Linus Torvalds's avatar
Linus Torvalds committed
854
855
	}

856
857
858
859
860
861
862
863
	if (start != vma->vm_start) {
		vma->vm_start = start;
		start_changed = true;
	}
	if (end != vma->vm_end) {
		vma->vm_end = end;
		end_changed = true;
	}
Linus Torvalds's avatar
Linus Torvalds committed
864
865
866
867
868
869
870
871
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	if (root) {
		if (adjust_next)
872
873
			vma_interval_tree_insert(next, root);
		vma_interval_tree_insert(vma, root);
Linus Torvalds's avatar
Linus Torvalds committed
874
875
876
877
878
879
880
881
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
882
883
884
		if (remove_next != 3)
			__vma_unlink_prev(mm, next, vma);
		else
885
886
887
888
889
890
891
892
893
894
			/*
			 * vma is not before next if they've been
			 * swapped.
			 *
			 * pre-swap() next->vm_start was reduced so
			 * tell validate_mm_rb to ignore pre-swap()
			 * "next" (which is stored in post-swap()
			 * "vma").
			 */
			__vma_unlink_common(mm, next, NULL, false, vma);
Linus Torvalds's avatar
Linus Torvalds committed
895
896
897
898
899
900
901
902
903
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
904
905
906
907
908
	} else {
		if (start_changed)
			vma_gap_update(vma);
		if (end_changed) {
			if (!next)
909
				mm->highest_vm_end = vm_end_gap(vma);
910
911
912
			else if (!adjust_next)
				vma_gap_update(next);
		}
Linus Torvalds's avatar
Linus Torvalds committed
913
914
	}

915
916
917
918
	if (anon_vma) {
		anon_vma_interval_tree_post_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_post_update_vma(next);
919
		anon_vma_unlock_write(anon_vma);
920
	}
Linus Torvalds's avatar
Linus Torvalds committed
921
	if (mapping)
922
		i_mmap_unlock_write(mapping);
Linus Torvalds's avatar
Linus Torvalds committed
923

924
	if (root) {
925
		uprobe_mmap(vma);
926
927

		if (adjust_next)
928
			uprobe_mmap(next);
929
930
	}

Linus Torvalds's avatar
Linus Torvalds committed
931
	if (remove_next) {
Matt Helsley's avatar
Matt Helsley committed
932
		if (file) {
933
			uprobe_munmap(next, next->vm_start, next->vm_end);
Linus Torvalds's avatar
Linus Torvalds committed
934
			fput(file);
Matt Helsley's avatar
Matt Helsley committed
935
		}
936
937
		if (next->anon_vma)
			anon_vma_merge(vma, next);
Linus Torvalds's avatar
Linus Torvalds committed
938
		mm->map_count--;
939
		mpol_put(vma_policy(next));
940
		vm_area_free(next);
Linus Torvalds's avatar
Linus Torvalds committed
941
942
943
944
945
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
		if (remove_next != 3) {
			/*
			 * If "next" was removed and vma->vm_end was
			 * expanded (up) over it, in turn
			 * "next->vm_prev->vm_end" changed and the
			 * "vma->vm_next" gap must be updated.
			 */
			next = vma->vm_next;
		} else {
			/*
			 * For the scope of the comment "next" and
			 * "vma" considered pre-swap(): if "vma" was
			 * removed, next->vm_start was expanded (down)
			 * over it and the "next" gap must be updated.
			 * Because of the swap() the post-swap() "vma"
			 * actually points to pre-swap() "next"
			 * (post-swap() "next" as opposed is now a
			 * dangling pointer).
			 */
			next = vma;
		}
967
968
969
		if (remove_next == 2) {
			remove_next = 1;
			end = next->vm_end;
Linus Torvalds's avatar
Linus Torvalds committed
970
			goto again;
971
		}
972
973
		else if (next)
			vma_gap_update(next);
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
		else {
			/*
			 * If remove_next == 2 we obviously can't
			 * reach this path.
			 *
			 * If remove_next == 3 we can't reach this
			 * path because pre-swap() next is always not
			 * NULL. pre-swap() "next" is not being
			 * removed and its next->vm_end is not altered
			 * (and furthermore "end" already matches
			 * next->vm_end in remove_next == 3).
			 *
			 * We reach this only in the remove_next == 1
			 * case if the "next" vma that was removed was
			 * the highest vma of the mm. However in such
			 * case next->vm_end == "end" and the extended
			 * "vma" has vma->vm_end == next->vm_end so
			 * mm->highest_vm_end doesn't need any update
			 * in remove_next == 1 case.
			 */
994
			VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
995
		}
Linus Torvalds's avatar
Linus Torvalds committed
996
	}
997
	if (insert && file)
998
		uprobe_mmap(insert);
Linus Torvalds's avatar
Linus Torvalds committed
999
1000

	validate_mm(mm);
For faster browsing, not all history is shown. View entire blame