vmalloc.c 69.7 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
/*
 *  linux/mm/vmalloc.c
 *
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
Christoph Lameter's avatar
Christoph Lameter committed
8
 *  Numa awareness, Christoph Lameter, SGI, June 2005
Linus Torvalds's avatar
Linus Torvalds committed
9
10
 */

Nick Piggin's avatar
Nick Piggin committed
11
#include <linux/vmalloc.h>
Linus Torvalds's avatar
Linus Torvalds committed
12
13
14
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
15
#include <linux/sched/signal.h>
Linus Torvalds's avatar
Linus Torvalds committed
16
17
18
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
19
#include <linux/proc_fs.h>
20
#include <linux/seq_file.h>
21
#include <linux/debugobjects.h>
22
#include <linux/kallsyms.h>
Nick Piggin's avatar
Nick Piggin committed
23
#include <linux/list.h>
24
#include <linux/notifier.h>
Nick Piggin's avatar
Nick Piggin committed
25
26
27
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
28
#include <linux/pfn.h>
29
#include <linux/kmemleak.h>
Arun Sharma's avatar
Arun Sharma committed
30
#include <linux/atomic.h>
31
#include <linux/compiler.h>
32
#include <linux/llist.h>
33
#include <linux/bitops.h>
34

35
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
36
#include <asm/tlbflush.h>
37
#include <asm/shmparam.h>
Linus Torvalds's avatar
Linus Torvalds committed
38

39
40
#include "internal.h"

41
42
43
44
45
46
47
48
49
50
51
/*
 * Deferred-vfree machinery: frees requested from contexts that cannot
 * perform the unmap directly are queued on a per-cpu lock-free list and
 * drained later from workqueue context.
 */
struct vfree_deferred {
	struct llist_head list;		/* lock-free list of deferred regions */
	struct work_struct wq;		/* work item that drains @list */
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

/*
 * Workqueue callback: drain this CPU's deferred list and really free each
 * entry.  The llist_node is embedded at the start of the deferred region
 * itself, so the node pointer doubles as the address to free.
 */
static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

Nick Piggin's avatar
Nick Piggin committed
58
/*** Page table manipulation functions ***/
59

Linus Torvalds's avatar
Linus Torvalds committed
60
61
62
63
64
65
66
67
68
69
70
/*
 * Clear every pte in [addr, end) under @pmd.  Entries that are neither
 * none nor present would indicate corruption of the kernel page tables,
 * hence the WARN_ON.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

Nick Piggin's avatar
Nick Piggin committed
71
/*
 * Tear down the pmd range [addr, end): huge pmd mappings are cleared in
 * one shot, normal ones descend into vunmap_pte_range().
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_clear_huge(pmd))
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

87
/*
 * Tear down the pud range [addr, end): huge pud mappings are cleared in
 * one shot, normal ones descend into vunmap_pmd_range().
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_clear_huge(pud))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*
 * Tear down the p4d range [addr, end): huge p4d mappings are cleared in
 * one shot, normal ones descend into vunmap_pud_range().
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_clear_huge(p4d))
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}

Nick Piggin's avatar
Nick Piggin committed
119
/*
 * Unmap the kernel virtual range [addr, end): walk pgd -> p4d -> pud ->
 * pmd and clear the leaf ptes.  No TLB flush is done here; callers such
 * as __purge_vmap_area_lazy()/free_unmap_vmap_area() flush afterwards.
 */
static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/*
 * Install ptes for [addr, end) under @pmd, mapping successive entries of
 * @pages with protection @prot.  Returns 0 on success, -ENOMEM if the pte
 * page cannot be allocated or a pages[] slot is NULL, -EBUSY if a pte is
 * unexpectedly already populated.
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

Nick Piggin's avatar
Nick Piggin committed
160
161
/*
 * Allocate (if needed) and populate the pmd level for [addr, end),
 * delegating each pmd-sized chunk to vmap_pte_range().
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

177
/*
 * Allocate (if needed) and populate the pud level for [addr, end),
 * delegating each pud-sized chunk to vmap_pmd_range().
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/*
 * Allocate (if needed) and populate the p4d level for [addr, end),
 * delegating each p4d-sized chunk to vmap_pud_range().
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc(&init_mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

Nick Piggin's avatar
Nick Piggin committed
211
212
213
214
215
216
/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 *
 * Returns the number of pages mapped on success, or a negative errno.
 * No cache flush is performed -- use vmap_page_range() for that.
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;	/* running count of pages mapped so far */

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

238
239
240
241
242
243
244
245
246
247
/*
 * Map @pages into the kernel virtual range [start, end) with protection
 * @prot, then flush the new mapping out of the data cache.  Returns the
 * number of pages mapped, or a negative errno on failure.
 */
static int vmap_page_range(unsigned long start, unsigned long end,
			   pgprot_t prot, struct page **pages)
{
	int mapped = vmap_page_range_noflush(start, end, prot, pages);

	flush_cache_vmap(start, end);
	return mapped;
}

248
/*
 * Like is_vmalloc_addr(), but additionally accepts addresses inside the
 * dedicated module mapping area on architectures that have one.
 */
int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

263
/*
 * Walk a vmap address to the struct page it maps.
 *
 * Returns NULL if any level of the walk is empty, if the entry is a huge
 * or otherwise bad mapping (see the comment below), or if the pte is not
 * present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);

	/*
	 * Don't dereference bad PUD or PMD (below) entries. This will also
	 * identify huge mappings, which we may encounter on architectures
	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
	 * not [unambiguously] associated with a struct page, so there is
	 * no correct value to return for them.
	 */
	WARN_ON_ONCE(pud_bad(*pud));
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	WARN_ON_ONCE(pmd_bad(*pmd));
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
313
314

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	/* NOTE: vmalloc_to_page() may return NULL for unmapped/huge
	 * regions; callers are expected to pass a mapped address. */
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
322

Nick Piggin's avatar
Nick Piggin committed
323
324
325

/*** Global kva allocator ***/

/* vmap_area::flags values */
#define VM_LAZY_FREE	0x02
#define VM_VM_AREA	0x04	/* area is backed by a vm_struct */

/* Protects vmap_area_root, vmap_area_list and the free_vmap_cache globals. */
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
/* Areas freed lazily; drained by __purge_vmap_area_lazy(). */
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;

/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;	/* search start hint for alloc_vmap_area() */
static unsigned long cached_hole_size;	/* largest hole seen below the cache node */
static unsigned long cached_vstart;	/* vstart the cache was recorded for */
static unsigned long cached_align;	/* alignment the cache was recorded for */

/* Highest freed va_end inside the vmalloc area; hint for pcpu allocation. */
static unsigned long vmap_area_pcpu_hole;
Nick Piggin's avatar
Nick Piggin committed
342
343

/*
 * rb-tree lookup of the vmap_area containing @addr (va_start <= addr <
 * va_end).  Returns NULL if no area covers the address.  Caller must
 * hold vmap_area_lock (see find_vmap_area()).
 */
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * Insert @va into the address-ordered rb-tree and into the RCU-protected
 * vmap_area_list, keeping the list address-sorted.  Overlap with an
 * existing area is a bug.  Caller must hold vmap_area_lock.
 */
static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	while (*p) {
		struct vmap_area *tmp_va;

		parent = *p;
		tmp_va = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp_va->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp_va->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		list_add_rcu(&va->list, &vmap_area_list);
}

static void purge_vmap_area_lazy(void);

396
397
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);

Nick Piggin's avatar
Nick Piggin committed
398
399
400
401
402
403
404
405
406
407
408
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * May sleep.  Returns the new vmap_area on success, or ERR_PTR(-ENOMEM)
 * if the descriptor allocation fails, ERR_PTR(-EBUSY) if no hole of the
 * requested size could be found even after purging lazy areas and asking
 * the vmap_notify_list subscribers to release space.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	might_sleep();

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)
			goto nocache;
		if (addr + size < addr)	/* wrap-around check */
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)	/* wrap-around check */
			goto overflow;

		/* find the lowest existing area ending at or above addr */
		n = vmap_area_root.rb_node;
		first = NULL;

		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)	/* wrap-around check */
			goto overflow;

		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_next_entry(first, list);
	}

found:
	if (addr + size > vend)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	/* First failure: purge lazily-freed areas and retry once. */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	/* Then ask notifier subscribers to free address space. */
	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}

541
542
543
544
545
546
547
548
549
550
551
552
/*
 * Register a callback on vmap_notify_list.  The chain is invoked from
 * alloc_vmap_area() when KVA is exhausted, giving subscribers a chance
 * to release address space before the allocation fails.
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/* Remove a callback previously added by register_vmap_purge_notifier(). */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

Nick Piggin's avatar
Nick Piggin committed
553
554
555
/*
 * Remove @va from the rb-tree and list and schedule the descriptor for
 * RCU freeing.  Also repairs the free_vmap_cache search hint when the
 * freed area invalidates it.  Caller must hold vmap_area_lock.
 */
static void __free_vmap_area(struct vmap_area *va)
{
	BUG_ON(RB_EMPTY_NODE(&va->rb_node));

	if (free_vmap_cache) {
		if (va->va_end < cached_vstart) {
			/* Hole opened below the cached window: drop the hint. */
			free_vmap_cache = NULL;
		} else {
			struct vmap_area *cache;
			cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
			if (va->va_start <= cache->va_start) {
				free_vmap_cache = rb_prev(&va->rb_node);
				/*
				 * We don't try to update cached_hole_size or
				 * cached_align, but it won't go very wrong.
				 */
			}
		}
	}
	rb_erase(&va->rb_node, &vmap_area_root);
	RB_CLEAR_NODE(&va->rb_node);
	list_del_rcu(&va->list);

	/*
	 * Track the highest possible candidate for pcpu area
	 * allocation.  Areas outside of vmalloc area can be returned
	 * here too, consider only end addresses which fall inside
	 * vmalloc area proper.
	 */
	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
		vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);

	kfree_rcu(va, rcu_head);
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 * (locked wrapper around __free_vmap_area()).
 */
static void free_vmap_area(struct vmap_area *va)
{
	spin_lock(&vmap_area_lock);
	__free_vmap_area(va);
	spin_unlock(&vmap_area_lock);
}

/*
 * Clear the pagetable entries of a given vmap_area
 * (no TLB flush here -- callers flush, see free_unmap_vmap_area()).
 */
static void unmap_vmap_area(struct vmap_area *va)
{
	vunmap_page_range(va->va_start, va->va_end);
}

/*
 * Upper bound on the amount of lazily-freed virtual address space (in
 * pages) we accumulate before purging with a global TLB flush.
 *
 * A larger threshold covers more kernel page tables and makes each purge
 * slightly longer, but linearly reduces the number of global TLB flushes.
 * Scaling linearly with the CPU count would seem natural (vmap activity
 * could scale that way too), but workloads are often constrained in other
 * ways, and a big threshold risks large latencies on huge systems -- so
 * use a conservative log scale instead: 32MB worth of pages per
 * log2(online CPUs).  The factor is trivial to change if this ever turns
 * out to be a bottleneck on bigger machines.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);

633
634
635
636
637
/*
 * Serialize vmap purging.  There is no actual criticial section protected
 * by this look, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
638
static DEFINE_MUTEX(vmap_purge_lock);
639

640
641
642
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

643
644
645
646
647
648
649
650
651
/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	/* Push the lazy counter past its threshold so the next lazy free
	 * triggers an immediate purge. */
	atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

Nick Piggin's avatar
Nick Piggin committed
652
653
654
/*
 * Purges all lazily-freed vmap areas.
 *
 * [start, end) seeds the TLB-flush range and is grown to cover every
 * area on the purge list (callers pass ULONG_MAX, 0 to flush exactly
 * the pending areas).  Returns true if anything was purged.  Caller
 * must hold vmap_purge_lock.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	struct llist_node *valist;
	struct vmap_area *va;
	struct vmap_area *n_va;
	bool do_free = false;

	lockdep_assert_held(&vmap_purge_lock);

	/* First pass: compute the union of all pending ranges. */
	valist = llist_del_all(&vmap_purge_list);
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
		do_free = true;
	}

	if (!do_free)
		return false;

	/* One global flush for the whole batch, before freeing the KVA. */
	flush_tlb_kernel_range(start, end);

	spin_lock(&vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;

		__free_vmap_area(va);
		atomic_sub(nr, &vmap_lazy_nr);
		cond_resched_lock(&vmap_area_lock);
	}
	spin_unlock(&vmap_area_lock);
	return true;
}

Nick Piggin's avatar
Nick Piggin committed
690
691
692
693
694
695
/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		/* ULONG_MAX/0 => flush range shrinks to the pending areas */
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

Nick Piggin's avatar
Nick Piggin committed
702
703
704
705
706
/*
 * Kick off a purge of the outstanding lazy areas.
 * Unlike try_purge_vmap_area_lazy(), waits for the purge lock and also
 * reclaims fragmented per-cpu vmap blocks first.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 *
 * The area is only queued for lazy purging here; a purge is kicked off
 * once the pending total exceeds lazy_max_pages().
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	int nr_lazy;

	nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
				    &vmap_lazy_nr);

	/* After this point, we may free va at any time */
	llist_add(&va->purge_list, &vmap_purge_list);

	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

732
733
734
735
736
737
/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_vmap_area(va);
	/* With page-alloc debugging, flush eagerly to catch use-after-unmap. */
	if (debug_pagealloc_enabled())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

Nick Piggin's avatar
Nick Piggin committed
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
/*
 * Locked lookup of the vmap_area covering @addr; returns NULL when no
 * area contains the address.
 */
static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *result;

	spin_lock(&vmap_area_lock);
	result = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return result;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
779
780
781
782
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
Nick Piggin's avatar
Nick Piggin committed
783
784
785

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

786
787
static bool vmap_initialized __read_mostly = false;

Nick Piggin's avatar
Nick Piggin committed
788
789
790
791
792
793
794
795
796
/* Per-cpu queue of vmap_blocks that still have free space. */
struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;		/* vmap_block::free_list entries */
};

/* One VMAP_BLOCK_SIZE chunk of KVA, carved into per-allocation pages. */
struct vmap_block {
	spinlock_t lock;		/* protects the counters below */
	struct vmap_area *va;		/* backing KVA region */
	unsigned long free, dirty;	/* page counts: still-free / freed-but-unflushed */
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;	/* link on vmap_block_queue::free */
	struct rcu_head rcu_head;
	struct list_head purge;		/* link on local purge list, see purge_fragmented_blocks() */
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_SPINLOCK(vmap_block_tree_lock);
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

/*
 * Index of the vmap_block covering @addr, counted from the block-aligned
 * start of the vmalloc area.  Used as the key into vmap_block_tree.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
	unsigned long off = addr - (VMALLOC_START & ~(VMAP_BLOCK_SIZE - 1));

	return off / VMAP_BLOCK_SIZE;
}

828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
/*
 * Virtual address of page @pages_off within the block starting at
 * @va_start.  The result must stay inside the same block (BUG_ON checks
 * the block index is unchanged).
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	/* Reserve one block-sized, block-aligned chunk of KVA. */
	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	err = radix_tree_preload(gfp_mask);
	if (unlikely(err)) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	/* The requested 2^order pages live at the start of the block. */
	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;	/* empty dirty range */
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	/* Make the block findable by address for the free path. */
	vb_idx = addr_to_vb_idx(va->va_start);
	spin_lock(&vmap_block_tree_lock);
	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(err);	/* cannot fail after successful preload */
	radix_tree_preload_end();

	/* Publish the block on this CPU's free queue. */
	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

/*
 * Unlink @vb from the global vmap_block radix tree and release both the
 * backing vmap area and the block descriptor itself.
 *
 * The caller must guarantee the block is no longer reachable from any
 * per-cpu free list (see purge_fragmented_blocks()/vb_free()).
 */
static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;
	unsigned long vb_idx;

	/* Remove the block from the lookup tree under the tree lock. */
	vb_idx = addr_to_vb_idx(vb->va->va_start);
	spin_lock(&vmap_block_tree_lock);
	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(tmp != vb);	/* the tree must have pointed at this block */

	/* Lazily free the virtual range; no immediate TLB flush here. */
	free_vmap_area_noflush(vb->va);
	/* Concurrent RCU readers may still hold vb - defer the kfree(). */
	kfree_rcu(vb, rcu_head);
}

918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
/*
 * Reclaim fully-fragmented vmap blocks from @cpu's free queue: blocks whose
 * entire space is accounted as free-or-dirty but which are not yet fully
 * dirty, i.e. nothing in them is still mapped and they only waste address
 * space.  Claimed blocks are moved to a private list and freed afterwards.
 */
static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		/* Cheap unlocked pre-check; re-validated under the lock below. */
		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			/* Park on the local list; free outside the RCU section. */
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	/* Now actually release everything we claimed above. */
	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

Nick Piggin's avatar
Nick Piggin committed
961
962
963
964
/*
 * Fast-path allocator for small (<= VMAP_MAX_ALLOC pages) vmap requests:
 * carve @size bytes out of an existing vmap block on this CPU's free queue,
 * or fall back to creating a new block via new_vmap_block().
 *
 * @size must be page-aligned and non-zero.  Returns the mapped virtual
 * address, or an ERR_PTR() from new_vmap_block() on failure.
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return NULL;
	}
	order = get_order(size);

	/* Walk this CPU's free queue under RCU; per-block state under vb->lock. */
	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			/* Not enough room in this block - try the next one. */
			spin_unlock(&vb->lock);
			continue;
		}

		/* Allocations are carved from the tail of the free space. */
		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			/* Exhausted: drop the block off the free queue. */
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

/*
 * Free a mapping previously obtained from vb_alloc().  The space is only
 * marked dirty in its vmap block (lazy TLB flush); the block itself is
 * released once every page in it has been freed.
 *
 * @addr/@size must match the vb_alloc() call exactly: page-aligned and at
 * most VMAP_MAX_ALLOC pages.
 */
static void vb_free(const void *addr, unsigned long size)
{
	unsigned long offset;
	unsigned long vb_idx;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	/* Write back/invalidate cache lines before tearing down the mapping. */
	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);

	order = get_order(size);

	/* Page offset of @addr within its (VMAP_BLOCK_SIZE-aligned) block. */
	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
	offset >>= PAGE_SHIFT;

	/* Look up the owning block; RCU protects the radix-tree walk. */
	vb_idx = addr_to_vb_idx((unsigned long)addr);
	rcu_read_lock();
	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
	rcu_read_unlock();
	BUG_ON(!vb);

	/* Clear the page-table entries now; TLB flush is normally deferred. */
	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);

	/* With debug_pagealloc, stale TLB entries must go away immediately. */
	if (debug_pagealloc_enabled())
		flush_tlb_kernel_range((unsigned long)addr,
					(unsigned long)addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		/* Whole block is dirty: it can no longer be on a free list. */
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
	unsigned long start = ULONG_MAX, end = 0;
	int cpu;
	int flush = 0;

	if (unlikely(!vmap_initialized))
		return;

	might_sleep();

	/*
	 * Accumulate the union of all dirty (freed-but-not-flushed) ranges
	 * across every CPU's vmap block queue.
	 */
	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			spin_lock(&vb->lock);
			if (vb->dirty) {
				unsigned long va_start = vb->va->va_start;
				unsigned long s, e;

				s = va_start + (vb->dirty_min << PAGE_SHIFT);
				e = va_start + (vb->dirty_max << PAGE_SHIFT);

				start = min(s, start);
				end   = max(e, end);

				flush = 1;
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	/* If the purge did not flush for us, do the accumulated flush here. */
	if (!__purge_vmap_area_lazy(start, end) && flush)
		flush_tlb_kernel_range(start, end);
	mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)mem;
	struct vmap_area *va;

	might_sleep();
	/* Sanity-check that @mem really came from the vmalloc address range. */
	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(!PAGE_ALIGNED(addr));

	if (likely(count <= VMAP_MAX_ALLOC)) {
		/* Small mapping: must have come from the vb_alloc() fast path. */
		debug_check_no_locks_freed(mem, size);
		vb_free(mem, size);
		return;
	}

	/* Large mapping: it has its own vmap_area - find and release it. */
	va = find_vmap_area(addr);
	BUG_ON(!va);
	debug_check_no_locks_freed((void *)va->va_start,
				    (va->va_end - va->va_start));
	free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap so it's good.  But if you mix long-life and short-life
 * objects with vm_map_ram(), it could consume lots of address space through
 * fragmentation (especially on a 32bit machine).  You could see failures in
 * the end.  Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (likely(count <= VMAP_MAX_ALLOC)) {
		/* Small request: use the per-cpu vmap block fast path. */
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;
		addr = (unsigned long)mem;
	} else {
		/* Large request: allocate a dedicated vmap_area. */
		struct vmap_area *va;
		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	}
	/* Install page-table entries; undo the address reservation on failure. */
	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
		vm_unmap_ram(mem, count);
		return NULL;
	}
	return mem;
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Boot-time list of vm areas registered before vmalloc_init() runs
 * (via vm_area_add_early()/vm_area_register_early()); imported into
 * the real vmap allocator by vmalloc_init().
 */
static struct vm_struct *vmlist __initdata;
Nicolas Pitre's avatar
Nicolas Pitre committed
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
	struct vm_struct **link = &vmlist;
	struct vm_struct *cur;

	BUG_ON(vmap_initialized);

	/* Find the insertion point, keeping vmlist sorted by address. */
	while ((cur = *link) != NULL) {
		if (cur->addr >= vm->addr) {
			/* The new area must not overlap its successor. */
			BUG_ON(cur->addr < vm->addr + vm->size);
			break;
		}
		/* Nor may it overlap any area that precedes it. */
		BUG_ON(cur->addr + cur->size > vm->addr);
		link = &cur->next;
	}

	vm->next = *link;
	*link = vm;
}

1216
1217
1218
/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
1219
 * @align: requested alignment
1220
1221
1222
1223
1224
1225
1226
1227
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
1228
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1229
1230
{
	static size_t vm_init_off __initdata;
1231
1232
1233
1234
	unsigned long addr;

	addr = ALIGN(VMALLOC_START + vm_init_off, align);
	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1235

1236
	vm->addr = (void *)addr;
1237

Nicolas Pitre's avatar
Nicolas Pitre committed
1238
	vm_area_add_early(vm);
1239
1240
}

Nick Piggin's avatar
Nick Piggin committed
1241
1242
void __init vmalloc_init(void)
{
Ivan Kokshaysky's avatar
Ivan Kokshaysky committed
1243
1244
	struct vmap_area *va;
	struct vm_struct *tmp;
Nick Piggin's avatar
Nick Piggin committed
1245
1246
1247
1248
	int i;

	for_each_possible_cpu(i) {
		struct vmap_block_queue *vbq;
1249
		struct vfree_deferred *p;
Nick Piggin's avatar
Nick Piggin committed
1250
1251
1252
1253

		vbq = &per_cpu(vmap_block_queue, i);
		spin_lock_init(&vbq->lock);
		INIT_LIST_HEAD(&vbq->free);
1254
1255
1256
		p = &per_cpu(vfree_deferred, i);
		init_llist_head(&p->list);
		INIT_WORK(&p->wq, free_work);
Nick Piggin's avatar
Nick Piggin committed
1257
	}
1258

Ivan Kokshaysky's avatar
Ivan Kokshaysky committed
1259
1260
	/* Import existing vmlist entries. */
	for (tmp = vmlist; tmp; tmp = tmp->next) {
1261
		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1262
		va->flags = VM_VM_AREA;
Ivan Kokshaysky's avatar
Ivan Kokshaysky committed
1263
1264
		va->va_start = (unsigned long)tmp->addr;
		va->va_end = va->va_start + tmp->size;
1265
		va->vm = tmp;
Ivan Kokshaysky's avatar