page_owner.c 15.1 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
3
4
5
6
7
8
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
9
#include <linux/jump_label.h>
10
#include <linux/migrate.h>
11
#include <linux/stackdepot.h>
12
#include <linux/seq_file.h>
13

14
15
#include "internal.h"

16
17
18
19
20
21
/*
 * TODO: teach the users of PAGE_OWNER_STACK_DEPTH (__dump_page_owner and
 * save_stack) to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

22
struct page_owner {
23
24
	unsigned short order;
	short last_migrate_reason;
25
26
27
28
	gfp_t gfp_mask;
	depot_stack_handle_t handle;
};

29
/* Tracking stays off unless the "page_owner" boot parameter is "on". */
static bool page_owner_disabled = true;
/* Switched on by init_page_owner() when tracking is enabled. */
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

/*
 * Placeholder stack handles, registered once by init_page_owner():
 *  - dummy_handle:   returned by save_stack() when recursion is detected
 *  - failure_handle: returned by save_stack() when the depot gives no handle
 *  - early_handle:   assigned to pages found by init_pages_in_zone()
 */
static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

38
/*
 * Handler for the "page_owner" early boot parameter.
 *
 * Only the exact value "on" enables tracking; every other value is accepted
 * and leaves tracking disabled.
 *
 * Returns -EINVAL if the parameter was given without a value, 0 otherwise.
 */
static int __init early_page_owner_param(char *buf)
{
	if (buf == NULL)
		return -EINVAL;

	if (!strcmp(buf, "on"))
		page_owner_disabled = false;

	return 0;
}
early_param("page_owner", early_page_owner_param);

/* ->need callback of page_owner_ops: true only when tracking is enabled. */
static bool need_page_owner(void)
{
	return !page_owner_disabled;
}

58
/*
 * Capture a short stack trace (at most 4 entries) at the point of the call
 * and store it in the stack depot with GFP_KERNEL.
 *
 * Returns the depot handle produced by depot_save_stack().
 *
 * NOTE(review): presumably __always_inline so that each noinline
 * register_*_stack() wrapper shows up in its own trace and therefore gets a
 * distinct handle - confirm.
 */
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
	unsigned long entries[4];
	struct stack_trace dummy = {
		.nr_entries = 0,
		.max_entries = ARRAY_SIZE(entries),
		.entries = entries,
		.skip = 0,
	};

	save_stack_trace(&dummy);

	return depot_save_stack(&dummy, GFP_KERNEL);
}

72
static noinline void register_dummy_stack(void)
73
{
74
75
	dummy_handle = create_dummy_stack();
}
76

77
78
79
80
static noinline void register_failure_stack(void)
{
	failure_handle = create_dummy_stack();
}
81

82
83
84
static noinline void register_early_stack(void)
{
	early_handle = create_dummy_stack();
85
86
}

87
88
89
90
91
static void init_page_owner(void)
{
	if (page_owner_disabled)
		return;

92
93
	register_dummy_stack();
	register_failure_stack();
94
	register_early_stack();
95
	static_branch_enable(&page_owner_inited);
96
	init_early_allocated_pages();
97
98
99
}

struct page_ext_operations page_owner_ops = {
100
	.size = sizeof(struct page_owner),
101
102
103
104
	.need = need_page_owner,
	.init = init_page_owner,
};

105
106
107
108
109
/*
 * Return the struct page_owner stored page_owner_ops.offset bytes into
 * @page_ext.
 */
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
	void *base = page_ext;

	return base + page_owner_ops.offset;
}

110
111
112
113
114
115
116
void __reset_page_owner(struct page *page, unsigned int order)
{
	int i;
	struct page_ext *page_ext;

	for (i = 0; i < (1 << order); i++) {
		page_ext = lookup_page_ext(page + i);
117
118
		if (unlikely(!page_ext))
			continue;
119
120
121
122
		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
	}
}

123
124
static inline bool check_recursive_alloc(struct stack_trace *trace,
					unsigned long ip)
125
{
126
	int i;
127
128
129
130

	if (!trace->nr_entries)
		return false;

131
132
	for (i = 0; i < trace->nr_entries; i++) {
		if (trace->entries[i] == ip)
133
134
			return true;
	}
135

136
137
138
139
140
141
	return false;
}

/*
 * Capture the current stack (skipping 2 entries, at most
 * PAGE_OWNER_STACK_DEPTH deep) and store it in the stack depot using @flags.
 *
 * Returns the depot handle, dummy_handle when recursion is detected, or
 * failure_handle when the depot did not return a handle.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 2
	};
	depot_stack_handle_t handle;

	save_stack_trace(&trace);

	/* Drop a trailing ULONG_MAX entry if the trace ends with one. */
	if (trace.nr_entries) {
		unsigned int last = trace.nr_entries - 1;

		if (trace.entries[last] == ULONG_MAX)
			trace.nr_entries = last;
	}

	/*
	 * Storing a new trace may make stackdepot allocate memory. That
	 * allocation would come back through here and call
	 * depot_save_stack() again, which still lacks memory and would
	 * allocate once more - forever. Seeing our own return address in
	 * the trace means we are inside such a nested call, so stop here.
	 */
	if (check_recursive_alloc(&trace, _RET_IP_))
		return dummy_handle;

	handle = depot_save_stack(&trace, flags);

	return handle ? handle : failure_handle;
}

172
173
/*
 * Fill the page_owner record of @page_ext with @handle, @order and
 * @gfp_mask, reset the migrate reason to -1 and mark the record valid by
 * setting PAGE_EXT_OWNER.
 */
static inline void __set_page_owner_handle(struct page_ext *page_ext,
	depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
{
	struct page_owner *owner = get_page_owner(page_ext);

	owner->handle = handle;
	owner->order = order;
	owner->gfp_mask = gfp_mask;
	owner->last_migrate_reason = -1;

	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}

186
187
188
189
190
191
192
193
194
195
196
197
198
/*
 * Record the current stack trace, @order and @gfp_mask as the owner
 * information of @page. Does nothing if @page has no page_ext.
 */
noinline void __set_page_owner(struct page *page, unsigned int order,
					gfp_t gfp_mask)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	__set_page_owner_handle(page_ext, save_stack(gfp_mask),
				order, gfp_mask);
}

199
200
201
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
	struct page_ext *page_ext = lookup_page_ext(page);
202
203
	struct page_owner *page_owner;

204
205
	if (unlikely(!page_ext))
		return;
206

207
208
	page_owner = get_page_owner(page_ext);
	page_owner->last_migrate_reason = reason;
209
210
}

211
void __split_page_owner(struct page *page, unsigned int order)
212
{
213
	int i;
214
	struct page_ext *page_ext = lookup_page_ext(page);
215
	struct page_owner *page_owner;
216

217
	if (unlikely(!page_ext))
218
		return;
219

220
221
	page_owner = get_page_owner(page_ext);
	page_owner->order = 0;
222
223
	for (i = 1; i < (1 << order); i++)
		__copy_page_owner(page, page + i);
224
225
}

226
227
228
229
void __copy_page_owner(struct page *oldpage, struct page *newpage)
{
	struct page_ext *old_ext = lookup_page_ext(oldpage);
	struct page_ext *new_ext = lookup_page_ext(newpage);
230
	struct page_owner *old_page_owner, *new_page_owner;
231

232
233
234
	if (unlikely(!old_ext || !new_ext))
		return;

235
236
237
238
239
240
241
	old_page_owner = get_page_owner(old_ext);
	new_page_owner = get_page_owner(new_ext);
	new_page_owner->order = old_page_owner->order;
	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
	new_page_owner->last_migrate_reason =
		old_page_owner->last_migrate_reason;
	new_page_owner->handle = old_page_owner->handle;
242
243
244
245
246
247
248
249
250
251
252
253
254

	/*
	 * We don't clear the bit on the oldpage as it's going to be freed
	 * after migration. Until then, the info can be useful in case of
	 * a bug, and the overal stats will be off a bit only temporarily.
	 * Also, migrate_misplaced_transhuge_page() can still fail the
	 * migration and then we want the oldpage to retain the info. But
	 * in that case we also don't need to explicitly clear the info from
	 * the new page, which will be freed.
	 */
	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}

255
256
257
258
259
/*
 * Count, per migratetype, the pageblocks of @zone that contain at least one
 * tracked page whose allocation migratetype (derived from its recorded gfp
 * mask) differs from the pageblock's migratetype, and print one line with
 * the counts to @m.
 */
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
				       pg_data_t *pgdat, struct zone *zone)
{
	unsigned long pfn = zone->zone_start_pfn;
	unsigned long end_pfn = pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };
	unsigned long block_end_pfn;
	int mt;

	/*
	 * Scan pageblock by pageblock; the first and last block may be
	 * incomplete. A pageblock that straddles a zone boundary is visited
	 * for both zones, which does not make the mixed block count wrong.
	 */
	while (pfn < end_pfn) {
		struct page *page;
		int pageblock_mt;

		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		page = pfn_to_page(pfn);
		pageblock_mt = get_pageblock_migratetype(page);

		for (; pfn < block_end_pfn; pfn++) {
			struct page_ext *page_ext;
			struct page_owner *page_owner;
			int page_mt;

			if (!pfn_valid_within(pfn))
				continue;

			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (PageBuddy(page)) {
				unsigned long freepage_order;

				/* Jump over the rest of the free page. */
				freepage_order = page_order_unsafe(page);
				if (freepage_order < MAX_ORDER)
					pfn += (1UL << freepage_order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			page_owner = get_page_owner(page_ext);
			page_mt = gfpflags_to_migratetype(
					page_owner->gfp_mask);

			if (pageblock_mt != page_mt) {
				/* CMA blocks are accounted as MOVABLE. */
				int idx = pageblock_mt;

				if (is_migrate_cma(pageblock_mt))
					idx = MIGRATE_MOVABLE;
				count[idx]++;

				/* One hit per block is enough. */
				pfn = block_end_pfn;
				break;
			}

			/* Skip the remaining pages of this allocation. */
			pfn += (1UL << page_owner->order) - 1;
		}
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mt = 0; mt < MIGRATE_TYPES; mt++)
		seq_printf(m, "%12lu ", count[mt]);
	seq_putc(m, '\n');
}

338
339
/*
 * Format the owner record of one page and copy it to user space.
 *
 * @buf:        user buffer to fill
 * @count:      size of @buf as requested by the reader
 * @pfn:        page frame number being reported
 * @page:       struct page of @pfn
 * @page_owner: owner record of @page
 * @handle:     stack depot handle to print the stack trace from
 *
 * The text is built in a temporary kernel buffer. Returns the number of
 * bytes copied, -ENOMEM if the buffer cannot be allocated or the record
 * does not fit into it, or -EFAULT if the copy to user space fails.
 */
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_owner *page_owner,
		depot_stack_handle_t handle)
{
	int ret;
	int pageblock_mt, page_mt;
	char *kbuf;
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};

	/*
	 * @count comes straight from the reader, so do not let it size an
	 * arbitrarily large kernel allocation. A record (three short lines
	 * plus at most PAGE_OWNER_STACK_DEPTH stack entries) is expected to
	 * fit in one page; if it ever does not, the "ret >= count" checks
	 * below still fail the read with -ENOMEM as before.
	 */
	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = snprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg)\n",
			page_owner->order, page_owner->gfp_mask,
			&page_owner->gfp_mask);

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt  = gfpflags_to_migratetype(page_owner->gfp_mask);
	ret += snprintf(kbuf + ret, count - ret,
			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			page->flags, &page->flags);

	if (ret >= count)
		goto err;

	depot_fetch_stack(handle, &trace);
	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
	if (ret >= count)
		goto err;

	/* -1 means no migrate reason has been recorded for this page. */
	if (page_owner->last_migrate_reason != -1) {
		ret += snprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
		if (ret >= count)
			goto err;
	}

	/* A blank line terminates the record. */
	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

408
409
410
void __dump_page_owner(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
411
	struct page_owner *page_owner;
412
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
413
	struct stack_trace trace = {
414
415
416
417
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
418
	};
419
	depot_stack_handle_t handle;
420
421
	gfp_t gfp_mask;
	int mt;
422

423
424
425
426
	if (unlikely(!page_ext)) {
		pr_alert("There is not page extension available.\n");
		return;
	}
427
428
429

	page_owner = get_page_owner(page_ext);
	gfp_mask = page_owner->gfp_mask;
430
	mt = gfpflags_to_migratetype(gfp_mask);
431

432
433
434
435
436
	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

437
	handle = READ_ONCE(page_owner->handle);
438
439
440
441
442
443
	if (!handle) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

	depot_fetch_stack(handle, &trace);
Joe Perches's avatar
Joe Perches committed
444
	pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
445
		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
446
447
	print_stack_trace(&trace, 0);

448
	if (page_owner->last_migrate_reason != -1)
449
		pr_alert("page has been migrated, last migrate reason: %s\n",
450
			migrate_reason_names[page_owner->last_migrate_reason]);
451
452
}

453
454
455
456
457
458
/*
 * ->read handler of the "page_owner" debugfs file.
 *
 * Each call reports at most one page. *@ppos is the offset, relative to
 * min_low_pfn, of the pfn at which to resume scanning; it is advanced past
 * the reported page.
 *
 * Returns what print_page_owner() returns for the page found, 0 once
 * max_pfn is reached without finding one, or -EINVAL if page_owner has not
 * been initialised.
 */
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	pfn = min_low_pfn + *ppos;

	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)))
		pfn++;

	drain_all_pages(NULL);

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		struct page *page;
		struct page_ext *page_ext;
		struct page_owner *page_owner;
		depot_stack_handle_t handle;

		/*
		 * On entering a new MAX_ORDER_NR_PAGES area, check that it
		 * exists and skip all of it if it does not.
		 */
		if (!(pfn & (MAX_ORDER_NR_PAGES - 1)) && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		/* Check for holes within a MAX_ORDER area */
		if (!pfn_valid_within(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/* Jump over the rest of the free page. */
			if (freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = lookup_page_ext(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * The zone lock is not held, so pages being allocated or
		 * freed concurrently may be missed.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			continue;

		page_owner = get_page_owner(page_ext);

		/*
		 * The handle is not updated under any lock we hold, so take
		 * a single snapshot of it.
		 */
		handle = READ_ONCE(page_owner->handle);
		if (!handle)
			continue;

		/* Record the next PFN to read in the file offset */
		*ppos = (pfn - min_low_pfn) + 1;

		return print_page_owner(buf, count, pfn, page,
				page_owner, handle);
	}

	return 0;
}

529
530
/*
 * Tag every page of @zone that is neither free (buddy) nor reserved and has
 * no owner record yet with early_handle (order 0, gfp mask 0), then log how
 * many pages were tagged.
 */
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
	unsigned long pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count = 0;

	/*
	 * Walk the zone one pageblock at a time, rescheduling between
	 * blocks. Pages that belong to another zone are skipped.
	 */
	while (pfn < end_pfn) {
		unsigned long block_end_pfn;

		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		for (; pfn < block_end_pfn; pfn++) {
			struct page *page;
			struct page_ext *page_ext;

			if (!pfn_valid_within(pfn))
				continue;

			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			/*
			 * zone->lock is not taken, so the buddy order is
			 * read with care. The worst outcome is skipping too
			 * far and missing some early allocated pages, which
			 * beats heavy lock contention.
			 */
			if (PageBuddy(page)) {
				unsigned long order = page_order_unsafe(page);

				if (order > 0 && order < MAX_ORDER)
					pfn += (1UL << order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			/* Found early allocated page */
			__set_page_owner_handle(page_ext, early_handle, 0, 0);
			count++;
		}
		cond_resched();
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		pgdat->node_id, zone->name, count);
}

/* Run init_pages_in_zone() on every populated zone of node @pgdat. */
static void init_zones_in_node(pg_data_t *pgdat)
{
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = &pgdat->node_zones[i];

		if (populated_zone(zone))
			init_pages_in_zone(pgdat, zone);
	}
}

static void init_early_allocated_pages(void)
{
	pg_data_t *pgdat;

	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);
}

621
622
623
624
625
626
627
628
/* File operations of the "page_owner" debugfs file: read only. */
static const struct file_operations proc_page_owner_operations = {
	.read = read_page_owner,
};

/*
 * Create the "page_owner" debugfs file (mode 0400, at the debugfs root)
 * when page_owner has been initialised; otherwise just log that it is
 * disabled.
 *
 * Returns 0, or the error carried by debugfs_create_file()'s result.
 */
static int __init pageowner_init(void)
{
	struct dentry *dentry;

	if (static_branch_unlikely(&page_owner_inited)) {
		dentry = debugfs_create_file("page_owner", 0400, NULL,
					     NULL, &proc_page_owner_operations);
		return PTR_ERR_OR_ZERO(dentry);
	}

	pr_info("page_owner is disabled\n");
	return 0;
}
639
late_initcall(pageowner_init)