// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
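/*
 * Worked example for the helpers above, assuming pageblock_order == 9
 * (512 pages per pageblock, typical for x86_64 with 2MB huge pages);
 * for pfn = 1000:
 *
 *	block_start_pfn(1000, 9) == round_down(1000, 512) == 512
 *	block_end_pfn(1000, 9)   == ALIGN(1001, 512)      == 1024
 *
 * i.e. pfn 1000 lies in the pageblock [512, 1024).
 */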

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void map_pages(struct list_head *list)
{
	unsigned int i, order, nr_pages;
	struct page *page, *next;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);

		order = page_private(page);
		nr_pages = 1 << order;

		post_alloc_hook(page, order, __GFP_MOVABLE);
		if (order)
			split_page(page, order);

		for (i = 0; i < nr_pages; i++) {
			list_add(&page->lru, &tmp_list);
			page++;
		}
	}

	list_splice(&tmp_list, list);
}
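/*
 * Example of the transformation above (illustrative): if the freelist holds a
 * single page that was isolated as an order-2 buddy (page_private() == 2),
 * map_pages() prepares and splits it, and the list ends up holding four
 * order-0 pages covering the same four pfns, ready to be handed out as
 * migration targets.
 */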

#ifdef CONFIG_COMPACTION

int PageMovable(struct page *page)
{
	struct address_space *mapping;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (!__PageMovable(page))
		return 0;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
		return 1;

	return 0;
}
EXPORT_SYMBOL(PageMovable);

void __SetPageMovable(struct page *page, struct address_space *mapping)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);

void __ClearPageMovable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	/*
	 * Clear the registered address_space value while keeping the
	 * PAGE_MAPPING_MOVABLE flag so that the VM can catch a page released
	 * by the driver after isolation. With it, VM migration doesn't try to
	 * put the page back.
	 */
	page->mapping = (void *)((unsigned long)page->mapping &
				PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__ClearPageMovable);
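/*
 * Usage sketch for the helpers above (a hedged example, not taken from any
 * specific driver): a driver that wants its pages to be migratable provides
 * address_space_operations with ->isolate_page() (checked by PageMovable())
 * and, with the page locked, tags the page:
 *
 *	lock_page(page);
 *	__SetPageMovable(page, driver_mapping);
 *	unlock_page(page);
 *
 * The mapping pointer and the PAGE_MAPPING_MOVABLE flag share the same word;
 * __ClearPageMovable() keeps only the flag so that migration can tell the
 * driver has already released the page after isolation.
 */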

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	if (zone->compact_considered >= defer_limit)
		return false;

	trace_mm_compaction_deferred(zone, order);

	return true;
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}
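/*
 * Worked example of the deferral bookkeeping above: each call to
 * defer_compaction() resets compact_considered and bumps compact_defer_shift,
 * so compaction_deferred() keeps returning true until it has been consulted
 * 1 << compact_defer_shift times (2, then 4, ..., capped at
 * 1 << COMPACT_MAX_DEFER_SHIFT == 64). A success at or above the failed
 * order clears both counters via compaction_defer_reset().
 */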

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

static void reset_cached_positions(struct zone *zone)
{
	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
	zone->compact_cached_free_pfn =
				pageblock_start_pfn(zone_end_pfn(zone) - 1);
}

/*
 * Compound pages of >= pageblock_order should consistently be skipped until
 * released. It is always pointless to compact pages of such order (if they are
 * migratable), and the pageblocks they occupy cannot contain any free pages.
 */
static bool pageblock_skip_persistent(struct page *page)
{
	if (!PageCompound(page))
		return false;

	page = compound_head(page);

	if (compound_order(page) >= pageblock_order)
		return true;

	return false;
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		if (zone != page_zone(page))
			continue;
		if (pageblock_skip_persistent(page))
			continue;

		clear_pageblock_skip(page);
	}

	reset_cached_positions(zone);
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->no_set_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static inline bool pageblock_skip_persistent(struct page *page)
{
	return false;
}

static inline void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
{
	if (cc->mode == MIGRATE_ASYNC) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			cc->contended = true;
			return false;
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 *		async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 *		scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = true;
		return true;
	}

	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}
		cond_resched();
	}

	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}
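/*
 * The two helpers above are used from the scanner loops below roughly like
 * this (sketch of the pattern; see isolate_freepages_block() and
 * isolate_migratepages_block() for the real call sites):
 *
 *	if (!(pfn % SWAP_CLUSTER_MAX)
 *	    && compact_unlock_should_abort(lock, flags, &locked, cc))
 *		break;
 *
 * i.e. every SWAP_CLUSTER_MAX pfns the lock is dropped, IRQs get a chance to
 * run, and async compaction bails out instead of spinning or scheduling.
 */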

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;
	unsigned int order;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;

		/*
		 * For compound pages such as THP and hugetlbfs, we can save
		 * potentially a lot of iterations if we skip them at once.
		 * The check is racy, but we can consider only valid values
		 * and the only danger is skipping too much.
		 */
		if (PageCompound(page)) {
			const unsigned int order = compound_order(page);

			if (likely(order < MAX_ORDER)) {
				blockpfn += (1UL << order) - 1;
				cursor += (1UL << order) - 1;
			}
			goto isolate_fail;
		}

		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, we can skip some rechecking.
		 * Note that if we hold the lock now, checked_pageblock was
		 * already set in some previous iteration (or strict is true),
		 * so it is correct to skip the suitable migration target
		 * recheck as well.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction do not
			 * spin on the lock and we acquire the lock as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, will break it into order-0 pages */
		order = page_order(page);
		isolated = __isolate_free_page(page, order);
		if (!isolated)
			break;
		set_page_private(page, order);

		total_isolated += isolated;
		cc->nr_freepages += isolated;
		list_add_tail(&page->lru, freelist);

		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
			blockpfn += isolated;
			break;
		}

		/* Advance to the end of split page */
		blockpfn += isolated - 1;
		cursor += isolated - 1;
		continue;

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/*
	 * There is a tiny chance that we have read bogus compound_order(),
	 * so be careful to not go outside of the pageblock.
	 */
	if (unlikely(blockpfn > end_pfn))
		blockpfn = end_pfn;

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	cc->total_free_scanned += nr_scanned;
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}
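/*
 * Note on the contract above: the pages placed on @freelist still carry
 * their buddy order in page_private() and are not yet prepared for use.
 * Callers are expected to run map_pages() on the list, as
 * isolate_freepages_range() below does, to split them into order-0 pages.
 */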

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn += isolated,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		block_end_pfn = min(block_end_pfn, end_pfn);

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * scanning range to right one.
		 */
		if (pfn >= block_end_pfn) {
			block_start_pfn = pageblock_start_pfn(pfn);
			block_end_pfn = pageblock_end_pfn(pfn);
			block_end_pfn = min(block_end_pfn, end_pfn);
		}

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
						block_end_pfn, &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n.  (Max order
		 * page may span two pageblocks).
		 */
	}

	/* __isolate_free_page() does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}
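/*
 * Hedged usage note: the strict mode above is intended for contiguous range
 * allocation (e.g. CMA's alloc_contig_range()). A return of 0 means the
 * caller must give up on this PFN range; a non-zero return is one past the
 * last isolated PFN, with every free page in the range now isolated and
 * split to order-0.
 */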

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
	active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
	isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
			node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}
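/*
 * Example of the threshold above: on a node with 600 MB of inactive and
 * 400 MB of active LRU pages, the caller is throttled once more than 500 MB
 * worth of pages sit isolated, i.e. isolated > (inactive + active) / 2.
 */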

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;
	bool skip_on_failure = false;
	unsigned long next_skip_pfn = 0;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
		skip_on_failure = true;
		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
	}

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {

		if (skip_on_failure && low_pfn >= next_skip_pfn) {
			/*
			 * We have isolated all migration candidates in the
			 * previous order-aligned block, and did not skip it due
			 * to failure. We should migrate the pages now and
			 * hopefully succeed compaction.
			 */
			if (nr_isolated)
				break;

			/*
			 * We failed to isolate in the previous order-aligned
			 * block. Set the new boundary to the end of the
			 * current block. Note we can't simply increase
			 * next_skip_pfn by 1 << order, as low_pfn might have
			 * been incremented by a higher number due to skipping
			 * a compound or a high-order buddy page in the
			 * previous loop iteration.
			 */
			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
		}

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(zone_lru_lock(zone), flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			goto isolate_fail;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted. We can potentially save
		 * a lot of iterations if we skip them at once. The check is
		 * racy, but we can consider only valid values and the only
		 * danger is skipping too much.
		 */
		if (PageCompound(page)) {
			const unsigned int order = compound_order(page);

			if (likely(order < MAX_ORDER))
				low_pfn += (1UL << order) - 1;
			goto isolate_fail;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU and non-lru movable pages.
		 * Skip any other type of page
		 */
		if (!PageLRU(page)) {
			/*
			 * __PageMovable can return false positive so we need
			 * to verify it under page_lock.
			 */
			if (unlikely(__PageMovable(page)) &&
					!PageIsolated(page)) {
				if (locked) {
					spin_unlock_irqrestore(zone_lru_lock(zone),
									flags);
					locked = false;
				}

				if (!isolate_movable_page(page, isolate_mode))
					goto isolate_success;
			}

			goto isolate_fail;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			goto isolate_fail;

		/*
		 * Only allow to migrate anonymous pages in GFP_NOFS context
		 * because those do not depend on fs locks.
		 */
		if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
			goto isolate_fail;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(zone_lru_lock(zone),
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageCompound under lock */
			if (!PageLRU(page))
				goto isolate_fail;

			/*
			 * Page become compound since the non-locked check,
			 * and it's on LRU. It can only be a THP so the order
			 * is safe to read and it's 0 for tail pages.
			 */
			if (unlikely(PageCompound(page))) {
				low_pfn += (1UL << compound_order(page)) - 1;
				goto isolate_fail;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			goto isolate_fail;

		VM_BUG_ON_PAGE(PageCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));
		inc_node_page_state(page,
				NR_ISOLATED_ANON + page_is_file_cache(page));

isolate_success:
		list_add(&page->lru, &cc->migratepages);
		cc->nr_migratepages++;
		nr_isolated++;

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator.
		 * - this is the lowest page that was isolated and likely be
		 * then freed by migration.
		 */
		if (!cc->last_migrated_pfn)
			cc->last_migrated_pfn = low_pfn;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}

		continue;
isolate_fail:
		if (!skip_on_failure)
			continue;

		/*
		 * We have isolated some pages, but then failed. Release them
		 * instead of migrating, as we cannot form the cc->order buddy
		 * page anyway.
		 */
		if (nr_isolated) {
			if (locked) {
				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
				locked = false;
			}
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			cc->last_migrated_pfn = 0;
			nr_isolated = 0;
		}

		if (low_pfn < next_skip_pfn) {
			low_pfn = next_skip_pfn - 1;
			/*
			 * The check near the loop beginning would have updated
			 * next_skip_pfn too, but this is a bit simpler.
			 */
			next_skip_pfn += 1UL << cc->order;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	cc->total_migrate_scanned += nr_scanned;
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}
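/*
 * Worked example of the skip_on_failure logic above: for an async direct
 * compaction request with cc->order == 3, the scan proceeds in 8-page
 * (1 << 3) aligned chunks. If any page in such a chunk cannot be isolated,
 * the pages isolated so far are put back and low_pfn jumps to the next chunk
 * boundary, since the cc->order buddy page could not be formed there anyway.
 */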

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_start_pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);

		if (!pfn)
			break;

		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}

	return pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

static bool suitable_migration_source(struct compact_control *cc,
							struct page *page)
{
	int block_mt;

	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
		return true;

	block_mt = get_pageblock_migratetype(page);

	if (cc->migratetype == MIGRATE_MOVABLE)
		return is_migrate_movable(block_mt);
	else
		return block_mt == cc->migratetype;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth to check order for valid range.
		 */
		if (page_order_unsafe(page) >= pageblock_order)
			return false;
	}

	if (cc->ignore_block_suitable)
		return true;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (is_migrate_movable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	return (cc->free_pfn >> pageblock_order)
		<= (cc->migrate_pfn >> pageblock_order);
}
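/*
 * Worked example, again assuming pageblock_order == 9: with free_pfn == 5000
 * and migrate_pfn == 4700 both values shift down to pageblock 9, so the
 * scanners have met and compaction of the zone finishes. With free_pfn ==
 * 8000 (pageblock 15) the free scanner is still above the migrate scanner
 * and the scan continues.
 */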

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in the last pageblock of
	 * a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = pageblock_end_pfn(cc->migrate_pfn);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {
		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(cc, page))