// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include "internal.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x)	data_race(swap_cache_info.x++)
#define ADD_CACHE_INFO(x, nr)	data_race(swap_cache_info.x += (nr))

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

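/* Sum the pages currently held in every swap cache address space. */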
unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;
	struct swap_info_struct *si;

	for (i = 0; i < MAX_SWAPFILES; i++) {
		swp_entry_t entry = swp_entry(i, 1);

		/* Avoid get_swap_device() to warn for bad swap entry */
		if (!swp_swap_info(entry))
			continue;
		/* Prevent swapoff to free swapper_spaces */
		si = get_swap_device(entry);
		if (!si)
			continue;
		nr = nr_swapper_spaces[i];
		spaces = swapper_spaces[i];
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
		put_swap_device(si);
	}
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

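/*
 * Return the shadow (workingset) entry stored in the swap cache for
 * @entry, or NULL if the slot holds a real page or nothing at all.
 */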
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = find_get_entry(address_space, idx);
	if (xa_is_value(page))
		return page;
	if (page)
		put_page(page);
	return NULL;
}

/*
 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
	unsigned long i, nr = thp_nr_pages(page);
	void *old;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	do {
		unsigned long nr_shadows = 0;

		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				nr_shadows++;
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(page + i, entry.val + i);
			xas_store(&xas, page);
			xas_next(&xas);
		}
		address_space->nrexceptional -= nr_shadows;
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	ClearPageSwapCache(page);
	page_ref_sub(page, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i, nr = thp_nr_pages(page);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != page, entry);
		set_page_private(page + i, 0);
		xas_next(&xas);
	}
	ClearPageSwapCache(page);
	if (shadow)
		address_space->nrexceptional += nr;
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock. 
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty. A special case is MADV_FREE page. The page's pte could have
	 * dirty bit cleared but the page's SwapBacked bit is still set because
	 * clearing the dirty bit and SwapBacked bit has no lock protecting it.
	 * For such a page, unmap will not set the dirty bit, so page reclaim
	 * will not write the page out. This can cause data corruption when the
	 * page is swapped in later. Always setting the dirty bit for the page
	 * solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, thp_nr_pages(page));
}

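/*
 * Remove the shadow entries of swap device @type in the offset range
 * [@begin, @end] from the swap cache, one swap address space at a time.
 */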
void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		unsigned long nr_shadows = 0;
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
			nr_shadows++;
		}
		address_space->nrexceptional -= nr_shadows;
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

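/*
 * VMA based readahead is used only when it is enabled and no rotational
 * swap device is in use.
 */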
static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	put_swap_device(si);

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}

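/*
 * Look up the swap cache page for @entry, or allocate one, claim the swap
 * slot with SWAP_HAS_CACHE and insert the page into the swap cache and the
 * LRU.  *new_page_allocated tells the caller whether it must start the read
 * into the (locked) new page.
 */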
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct swap_info_struct *si;
	struct page *page;
	void *shadow = NULL;

	*new_page_allocated = false;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			return NULL;
		page = find_get_page(swap_address_space(entry),
				     swp_offset(entry));
		put_swap_device(si);
		if (page)
			return page;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			return NULL;

		/*
		 * Get a new page to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		page = alloc_page_vma(gfp_mask, vma, addr);
		if (!page)
			return NULL;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		put_page(page);
		if (err != -EEXIST)
			return NULL;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its page to swap cache.
		 */
		cond_resched();
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new page.
	 */

	__SetPageLocked(page);
	__SetPageSwapBacked(page);

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
		put_swap_page(page, entry);
		goto fail_unlock;
	}

	if (mem_cgroup_charge(page, NULL, gfp_mask)) {
		delete_from_swap_cache(page);
		goto fail_unlock;
	}

	if (shadow)
		workingset_refault(page, shadow);

	/* Caller will initiate read into locked page */
	SetPageWorkingset(page);
	lru_cache_add(page);
	*new_page_allocated = true;
	return page;

fail_unlock:
	unlock_page(page);
	put_page(page);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}

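/*
 * Size the next readahead window from the recent hit count, clamped to
 * @max_pages and not allowed to shrink below half of the previous window.
 */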
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Test swap type to make sure the dereference is safe */
	if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
		struct inode *inode = si->swap_file->f_mapping->host;
		if (inode_read_congested(inode))
			goto skip;
	}

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

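/*
 * Allocate one swap cache address_space per SWAP_ADDRESS_SPACE_PAGES chunk
 * of the swap device.
 */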
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	kvfree(swapper_spaces[type]);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

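/*
 * Clamp the readahead pfn range [@lpfn, @rpfn) to the VMA boundaries and
 * to the PMD-sized block around the faulting address @faddr.
 */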
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

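/*
 * Compute the VMA based readahead window around the faulting address and
 * record the PTEs to scan in @ra_info.
 */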
static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
	entry = pte_to_swp_entry(*pte);
	if ((unlikely(non_swap_entry(entry)))) {
		pte_unmap(orig_pte);
		return;
	}

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {0,};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * This is the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based (i.e.
 * physical disk based) or vma-based (i.e. virtual addresses around the
 * faulting address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}

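/* /sys/kernel/mm/swap/vma_ra_enabled toggles VMA based readahead. */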
#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
		enable_vma_readahead = true;
	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
		enable_vma_readahead = false;
	else
		return -EINVAL;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr =
	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
	       vma_ra_enabled_store);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif