/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list; the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/* work_structs for global per-cpu drains */
DEFINE_MUTEX(pcpu_drain_mutex);
DEFINE_PER_CPU(struct work_struct, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protect totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 * guaranteed not to run in parallel with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[] = {
	NULL,
	free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
static bool mirrored_kernelcore;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
	unsigned long max_initialise;
	unsigned long reserved_lowmem;

	/*
	 * Initialise at least 2G of a node but also take into account that
	 * two large system hashes can take up 1GB for 0.25TB/node.
	 */
	max_initialise = max(2UL << (30 - PAGE_SHIFT),
		(pgdat->node_spanned_pages >> 8));

	/*
	 * Compensate for all the memblock reservations (e.g. crash kernel)
	 * from the initial estimation to make sure we will initialize enough
	 * memory to boot.
	 */
	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
			pgdat->node_start_pfn + max_initialise);
	max_initialise += reserved_lowmem;

	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	/* Always populate low zones for address-constrained allocations */
	if (zone_end < pgdat_end_pfn(pgdat))
		return true;
	(*nr_initialised)++;
	if ((*nr_initialised > pgdat->static_init_size) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		pgdat->first_deferred_pfn = pfn;
		return false;
	}

	return true;
}
#else
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	return true;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return __pfn_to_section(pfn)->pageblock_flags;
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	bitidx += end_bitidx;
	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
}

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	bitidx += end_bitidx;
	mask <<= (BITS_PER_LONG - bitidx - 1);
	flags <<= (BITS_PER_LONG - bitidx - 1);

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}

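/* Record a pageblock's migratetype in the pageblock bitmap. */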
void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

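/*
 * Report a page found in an unexpected state: rate-limit the output and
 * taint the kernel so later problems can be correlated with this event.
 */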
static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	bad_flags &= page->flags;
	if (bad_flags)
		pr_alert("bad because of flags: %#lx(%pGp)\n",
						bad_flags, &bad_flags);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits point to the head page.
 *
 * The first tail page's ->compound_dtor holds the offset into the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

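/* Default destructor for a compound page: hand it back to the buddy allocator. */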
void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

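/*
 * Turn a freshly allocated higher-order page into a compound page: set the
 * destructor and order on the head page and link every tail page back to it.
 */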
void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;

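/* Parse the "debug_pagealloc" boot parameter. */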
static int __init early_debug_pagealloc(char *buf)
{
	if (!buf)
		return -EINVAL;
	return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static bool need_debug_guardpage(void)
{
	/* If we don't use debug_pagealloc, we don't need guard page */
	if (!debug_pagealloc_enabled())
		return false;

	if (!debug_guardpage_minorder())
		return false;

	return true;
}

static void init_debug_guardpage(void)
{
	if (!debug_pagealloc_enabled())
		return;

	if (!debug_guardpage_minorder())
		return;

	_debug_guardpage_enabled = true;
}

struct page_ext_operations debug_guardpage_ops = {
	.need = need_debug_guardpage,
	.init = init_debug_guardpage,
};

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

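/*
 * Mark a range as a debug guard area: flag it in page_ext and remove it from
 * the free-page accounting so it cannot be handed out.
 */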
static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return false;

	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return;

	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

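/*
 * Record the buddy order in page_private() and mark the page as being part of
 * the buddy allocator's free lists; rmv_page_order() below undoes both steps.
 */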
static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount
 * to PAGE_BUDDY_MAPCOUNT_VALUE.
 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 * serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with _mapcount
 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
 * field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long combined_pfn;
	unsigned long uninitialized_var(buddy_pfn);
	struct page *buddy;
	unsigned int max_order;

	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order - 1) {
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard(zone, buddy, order, migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
	if (max_order < MAX_ORDER) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order++;
		goto continue_merging;
	}

done_merging:
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
		struct page *higher_page, *higher_buddy;
		combined_pfn = buddy_pfn & pfn;
		higher_page = page + (combined_pfn - pfn);
		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
		if (pfn_valid_within(buddy_pfn) &&
		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
	zone->free_area[order].nr_free++;
}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			(unsigned long)page->mem_cgroup |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

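/* Figure out and report exactly which check a page being freed has failed. */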
static void free_pages_check_bad(struct page *page)
{
	const char *bad_reason;
	unsigned long bad_flags;

	bad_reason = NULL;
	bad_flags = 0;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	bad_page(page, bad_reason, bad_flags);
}

static inline int free_pages_check(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return 0;

	/* Something has gone sideways, find it */
	free_pages_check_bad(page);
	return 1;
}

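/*
 * Check that a tail page of a compound page is in the expected state before
 * the compound page is freed; also clears ->mapping and the compound link.
 */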
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping is compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount", 0);
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * page_deferred_list().next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page", 0);
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set", 0);
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent", 0);
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

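/*
 * Common checks and teardown for a page (or compound page) on its way back to
 * the buddy allocator, shared by the pcp free path and __free_pages_ok().
 */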
static __always_inline bool free_pages_prepare(struct page *page,
					unsigned int order, bool check_free)
{
	int bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageDoubleMap(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_pages_check(page, page + i);
			if (unlikely(free_pages_check(page + i))) {
				bad++;
				continue;
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_enabled() && PageKmemcg(page))
		memcg_kmem_uncharge(page, order);
	if (check_free)
		bad += free_pages_check(page);
	if (bad)
		return false;

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_poison_pages(page, 1 << order, 0);
	kernel_map_pages(page, 1 << order, 0);
	kasan_free_pages(page, order);

	return true;
}

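/*
 * With CONFIG_DEBUG_VM the expensive checks run when a page enters the pcp
 * lists and are skipped on the drain; without it, the cheap path defers the
 * checks until the pcp lists are flushed back to the buddy lists.
 */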
#ifdef CONFIG_DEBUG_VM
static inline bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, true);
}

static inline bool bulkfree_pcp_prepare(struct page *page)
{
	return false;
}
#else
static bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, false);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	return free_pages_check(page);
}
#endif /* CONFIG_DEBUG_VM */

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	bool isolated_pageblocks;

	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	while (count) {
		struct page *page;
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered.  This is so more pages are freed
		 * off fuller lists instead of spinning excessively around empty
		 * lists
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));