/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/set_memory.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"

#include "ident_map.c"

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

/* Bits supported by the hardware: */
pteval_t __supported_pte_mask __read_mostly = ~0;

/* Bits allowed in normal kernel mappings: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
EXPORT_SYMBOL(__default_kernel_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

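/*
 * sync_global_pgds_l5()/_l4() propagate kernel mappings in the range
 * [start, end] from init_mm's page tables into every pgd on pgd_list.
 * The _l4 variant works on p4d entries because the p4d level is folded
 * into the pgd when 5-level paging is disabled.
 */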
static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		const pgd_t *pgd_ref = pgd_offset_k(addr);
		struct page *page;

		/* Check for overflow */
		if (addr < start)
			break;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
		const p4d_t *p4d_ref;
		struct page *page;

		/*
		 * With folded p4d, pgd_none() is always false, so we need to
		 * handle synchronization at the p4d level.
		 */
		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
		p4d_ref = p4d_offset(pgd_ref, addr);

		if (p4d_none(*p4d_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_page_vaddr(*p4d)
				       != p4d_page_vaddr(*p4d_ref));

			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * When memory is added, make sure that all processes' MMs have
 * suitable PGD entries in their local PGD-level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	if (pgtable_l5_enabled())
		sync_global_pgds_l5(start, end);
	else
		sync_global_pgds_l4(start, end);
}

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

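/*
 * The fill_*() helpers allocate the next page-table level via spp_getpage()
 * if it is not present yet, and return the entry corresponding to @vaddr.
 */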
static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		p4d_t *p4d = (p4d_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, p4d);
		if (p4d != p4d_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       p4d, p4d_offset(pgd, 0));
	}
	return p4d_offset(pgd, vaddr);
}

static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
	if (p4d_none(*p4d)) {
		pud_t *pud = (pud_t *)spp_getpage();
		p4d_populate(&init_mm, p4d, pud);
		if (pud != pud_offset(p4d, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pud, pud_offset(p4d, 0));
	}
	return pud_offset(p4d, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #03!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one_kernel(vaddr);
}

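/*
 * Install @new_pte for @vaddr below the given p4d/pud page table page,
 * allocating any missing intermediate levels on the way down.
 */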
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
	p4d_t *p4d = p4d_page + p4d_index(vaddr);
	pud_t *pud = fill_pud(p4d, vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud = pud_page + pud_index(vaddr);

	__set_pte_vaddr(pud, vaddr, new_pte);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	p4d_t *p4d_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	p4d_page = p4d_offset(pgd, 0);
	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}

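/*
 * Populate any missing intermediate page-table levels for @vaddr in the
 * kernel page tables and return its PMD (or, below, PTE) entry.
 */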
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	p4d = fill_p4d(pgd, vaddr);
	pud = fill_pud(p4d, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;

	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(p4d, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/*
 * Create PTE level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
	      pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
	int i;

	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);

	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pte(pte, __pte(0));
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (!pte_none(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		if (0)
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return paddr_last;
}

/*
 * Create PMD level page table mapping for physical addresses. The virtual
 * and physical addresses have to be aligned at this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;

	int i = pmd_index(paddr);

	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
		pte_t *pte;
		pgprot_t new_prot = prot;

		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pmd(pmd, __pmd(0));
			continue;
		}

		if (!pmd_none(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pte = alloc_low_page();
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, pte);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return paddr_last;
}

/*
 * Create PUD level page table mapping for physical addresses. The virtual
 * and physical addresses do not have to be aligned at this level. KASLR can
 * randomize virtual addresses up to this level.
 * It returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);

	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
		pud_t *pud;
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pud(pud, __pud(0));
			continue;
		}

		if (!pud_none(*pud)) {
			if (!pud_large(*pud)) {
				pmd = pmd_offset(pud, 0);
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
					PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, pmd);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return paddr_last;
}

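/*
 * Create P4D level page table mapping for physical addresses. Without
 * 5-level paging the p4d level is folded, so the page is handed straight
 * to phys_pud_init(). It returns the last physical address mapped.
 */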
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long paddr_next, paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = p4d_index(vaddr);

	if (!pgtable_l5_enabled())
		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);

	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
		p4d_t *p4d;
		pud_t *pud;

		vaddr = (unsigned long)__va(paddr);
		p4d = p4d_page + p4d_index(vaddr);
		paddr_next = (paddr & P4D_MASK) + P4D_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_p4d(p4d, __p4d(0));
			continue;
		}

		if (!p4d_none(*p4d)) {
			pud = pud_offset(p4d, 0);
			paddr_last = phys_pud_init(pud, paddr,
					paddr_end,
					page_size_mask);
			__flush_tlb_all();
			continue;
		}

		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, paddr_end,
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate(&init_mm, p4d, pud);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return paddr_last;
}

/*
 * Create page table mappings for the physical memory at specific physical
 * addresses. The virtual and physical addresses have to be aligned on PMD
 * level down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask);
			continue;
		}

		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		if (pgtable_l5_enabled())
			pgd_populate(&init_mm, pgd, p4d);
		else
			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(vaddr_start, vaddr_end - 1);

	__flush_tlb_all();

	return paddr_last;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
#endif

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	if (N_MEMORY != N_NORMAL_MEMORY)
		node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

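/*
 * Register a hot-added range of pages with the core mm and update the
 * end-of-memory bookkeeping (max_pfn, max_low_pfn, high_memory).
 */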
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap, bool want_memblock)
{
	int ret;

	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
				  nr_pages << PAGE_SHIFT);

	return ret;
}

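/*
 * Create the direct mapping for a hot-added range and then hand the pages
 * over to the core mm via add_pages().
 */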
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
		bool want_memblock)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	init_memory_mapping(start, start + size);

	return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}

#define PAGE_INUSE 0xFD

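/*
 * Free a no-longer-used page-table page: bootmem-allocated (reserved) pages
 * are released via put_page_bootmem()/free_reserved_page(), anything else
 * is returned to the page allocator.
 */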
static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);

		magic = (unsigned long)page->freelist;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

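/*
 * Free the pages backing a PMD-sized (huge page) mapping; if they came from
 * a vmem_altmap, return the space to the altmap instead.
 */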
static void __meminit free_hugepage_table(struct page *page,
		struct vmem_altmap *altmap)
{
	if (altmap)
		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
	else
		free_pagetable(page, get_order(PMD_SIZE));
}

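/*
 * free_{pte,pmd,pud}_table(): if every entry in the given lower-level table
 * is empty, free the table page and clear the entry that pointed to it.
 */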
static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	/* free a pte table */
	free_pagetable(pmd_page(*pmd), 0);
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	/* free a pmd table */
	free_pagetable(pud_page(*pud), 0);
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}

static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	/* free a pud table */
	free_pagetable(p4d_page(*p4d), 0);
	spin_lock(&init_mm.page_table_lock);
	p4d_clear(p4d);
	spin_unlock(&init_mm.page_table_lock);
}

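/*
 * Tear down the PTE-level mappings for [addr, end); unless this is the
 * direct mapping, the backing pages are freed as well.
 */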
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	void *page_addr;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			/*
			 * Do not free direct mapping pages since they were
			 * freed when offlining, or simply not in use.
			 */
			if (!direct)
				free_pagetable(pte_page(*pte), 0);