// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmalloc.c
 *
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>

#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"

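/*
 * Deferred vfree: per-CPU list of areas whose freeing is handed off to a
 * workqueue, e.g. when vfree() is invoked from interrupt context.
 */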
struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

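/* Workqueue callback: drain this CPU's deferred-vfree list and free each entry. */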
static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_clear_huge(pmd))
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_clear_huge(pud))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_clear_huge(p4d))
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}

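/* Tear down all page-table entries covering the kernel VA range [addr, end). */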
static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc(&init_mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

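/*
 * Map the given pages into the kernel VA range [start, end) with @prot and
 * flush the CPU cache for that range. Returns the number of pages mapped or
 * a negative errno.
 */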
static int vmap_page_range(unsigned long start, unsigned long end,
			   pgprot_t prot, struct page **pages)
{
	int ret;

	ret = vmap_page_range_noflush(start, end, prot, pages);
	flush_cache_vmap(start, end);
	return ret;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);

	/*
	 * Don't dereference bad PUD or PMD (below) entries. This will also
	 * identify huge mappings, which we may encounter on architectures
	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
	 * not [unambiguously] associated with a struct page, so there is
	 * no correct value to return for them.
	 */
	WARN_ON_ONCE(pud_bad(*pud));
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	WARN_ON_ONCE(pmd_bad(*pmd));
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0

static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augment red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

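/*
 * Helpers for the augmented free-space rb-tree: the byte size of a VA and
 * the cached maximum free-block size stored in a subtree.
 */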
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Called when a node is removed and the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

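/* Number of pages currently used by vmalloc allocations. */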
unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

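/* Look up the busy vmap_area containing @addr; caller must hold vmap_area_lock. */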
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node's left or right link
 * where the new node will be attached, and reports the parent itself via
 * @parent.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction
	 * ("link") where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger the BUG() if the new area partially or fully
		 * overlaps an existing one.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else
			BUG();
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * This is because we populate the tree from the bottom
		 * up to the parent levels once the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

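/*
 * Detach a VA from its rb-tree (augmented or plain) and from the
 * address-sorted list.
 */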
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(struct rb_node *n)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long size;
	bool found = false;

	if (n == NULL)
		return;

	va = rb_entry(n, struct vmap_area, rb_node);
	size = va->subtree_max_size;
	node = n;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) == size) {
			node = node->rb_left;
		} else {
			if (va_size(va) == size) {
				found = true;
				break;
			}

			node = node->rb_right;
		}
	}

	if (!found) {
		va = rb_entry(n, struct vmap_area, rb_node);
		pr_emerg("tree is corrupted: %lu, %lu\n",
			va_size(va), va->subtree_max_size);
	}

	augment_tree_propagate_check(n->rb_left);
	augment_tree_propagate_check(n->rb_right);
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1,
 * only its subtree_max_size is updated and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	struct rb_node *node = &va->rb_node;
	unsigned long new_va_sub_max_size;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);
		new_va_sub_max_size = compute_subtree_max_size(va);

		/*
		 * If the newly calculated maximum available size of the
		 * subtree is equal to the current one, then it means that
		 * the tree is propagated correctly. So we have to stop at
		 * this point to save cycles.
		 */
		if (va->subtree_max_size == new_va_sub_max_size)
			break;

		va->subtree_max_size = new_va_sub_max_size;
		node = rb_parent(&va->rb_node);
	}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check(free_vmap_area_root.rb_node);
#endif
}

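/*
 * Link a VA into the given rb-tree and address-sorted list. The _augment
 * variant additionally propagates subtree_max_size up the tree.
 */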
static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	link_va(va, root, parent, link, head);
	augment_tree_propagate_from(va);
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If coalescing is not possible, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Check and update the tree if needed. */
			augment_tree_propagate_from(sibling);

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			sibling->va_end = va->va_end;

			/* Check and update the tree if needed. */
			augment_tree_propagate_from(sibling);

			if (merged)
				unlink_va(va, root);

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}

	return va;
}

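/*
 * Check whether this free VA can hold a @size allocation aligned to @align
 * that starts at or above @vstart.
 */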
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that can satisfy the request described by the passed
 * parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal or bigger to the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree,
			 * that will satisfy the search criteria. It can happen
			 * only once due to "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

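/*
 * Classify how the range [nva_start_addr, nva_start_addr + size) fits inside
 * the free VA: full fit, left-edge, right-edge or no-edge (middle) fit.
 */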
static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}

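/*
 * Carve the requested range out of the free VA according to the fit type,
 * splitting the VA for a no-edge fit. Returns 0 on success, -1 on failure.
 */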
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For percpu allocator we do not do any pre-allocation
			 * and leave it as it is. The reason is it most likely
			 * never ends up with NE_FIT_TYPE splitting. In case of
			 * percpu allocations offsets and sizes are aligned to
			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
			 * are its main fitting cases.
			 *
			 * There are a few exceptions though: for example, the
			 * first allocation (early boot-up), when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() why. If so, then
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purpose. That is rare and most time does not
			 * occur.
			 *
			 * If an allocation fails, an "overflow" path is
			 * triggered to purge lazily freed areas in order to
			 * free some memory, then the "retry" path is triggered
			 * to repeat the attempt one more time. See more details
			 * in alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns the start address of the newly allocated area on success.
 * Otherwise "vend" is returned to indicate failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va, *pva;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of the free area is NE_FIT_TYPE. Please note, it
	 * does not guarantee that an allocation occurs on a CPU that
	 * is preloaded, instead we minimize the case when it is not.
	 * It can happen because of cpu migration, because there is a
	 * race until the below spinlock is taken.
	 *
	 * The preload is done in non-atomic context, thus it allows us
	 * to use more permissive allocation masks to be more stable under
	 * low memory condition and high memory pressure. In the rare case
	 * that the CPU is not preloaded, GFP_NOWAIT is used.
	 *
	 * Set "pva" to NULL here, because of the "retry" path.
	 */
	pva = NULL;

	if (!this_cpu_read(ne_fit_preload_node))
		/*
		 * Even if it fails we do not really care about that.
		 * Just proceed as it is. If needed, the "overflow" path
		 * will refill the cache we allocate from.
		 */
		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(&free_vmap_area_lock);

	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
		kmem_cache_free(vmap_area_cachep, pva);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * Clear the pagetable entries of a given vmap_area
 */
static void unmap_vmap_area(struct vmap_area *va)
{
	vunmap_page_range(va->va_start, va->va_end);
}

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

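	/*
	 * Example: with 4 online CPUs, fls(4) == 3, so (assuming 4K pages)
	 * up to 3 * 8192 lazily freed pages (96MB of VA) may accumulate
	 * before a purge is attempted.
	 */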
	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct llist_node *valist;
	struct vmap_area *va;
	struct vmap_area *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	valist = llist_del_all(&vmap_purge_list);
	if (unlikely(valist == NULL))
		return false;

	/*
	 * First make sure the mappings are removed from all page-tables
	 * before they are freed.
	 */
	vmalloc_sync_all();

	/*
	 * TODO: to calculate a flush range without looping.
	 * The list can be up to lazy_max_pages() elements.
	 */
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
	}

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
					    &free_vmap_area_list);

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/* After this point, we may free va at any time */
	llist_add(&va->purge_list, &vmap_purge_list);

	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_vmap_area(va);
	if (debug_pagealloc_enabled())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

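/* Look up the busy vmap_area containing @addr, taking vmap_area_lock. */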
static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_SPINLOCK(vmap_block_tree_lock);
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

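/* Index of the vmap block covering @addr in the vmap_block radix tree. */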
static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

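/*
 * Virtual address of the page at @pages_off within the block starting at
 * @va_start.
 */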
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	err = radix_tree_preload(gfp_mask);
	if (unlikely(err)) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;