/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};
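
/*
 * Illustrative sketch (not part of the original file): shmem_fallocate()
 * publishes one of these through inode->i_private so that a concurrent
 * shmem_fault() on the hole being punched can wait for the punch to finish,
 * roughly:
 *
 *	shmem_falloc.waitq = &shmem_falloc_waitq;
 *	shmem_falloc.start = unmap_start >> PAGE_SHIFT;
 *	shmem_falloc.next  = (unmap_end + 1) >> PAGE_SHIFT;
 *	spin_lock(&inode->i_lock);
 *	inode->i_private = &shmem_falloc;
 *	spin_unlock(&inode->i_lock);
 *
 * (variable names above are illustrative, not quoted from this section)
 */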

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, int *fault_type);

int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}
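
/*
 * Worked example (illustrative, assuming 4K pages): shmem_acct_size() for a
 * 1MB object charges VM_ACCT(1MB) = 256 pages of commit up front, and
 * shmem_unacct_size() returns the same 256 pages when the object goes away.
 */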

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}
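
/*
 * Illustrative sketch (not part of the original file): allocation paths pair
 * the two helpers above, charging before allocating a page and undoing the
 * charge if the allocation or page-cache insertion fails, roughly:
 *
 *	if (!shmem_inode_acct_block(inode, 1))
 *		return -ENOSPC;
 *	...allocate the page and add it to the page cache...
 *	if (error)
 *		shmem_inode_unacct_blocks(inode, 1);
 */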

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}
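
/*
 * Worked example (illustrative): with info->alloced == 8, info->swapped == 2
 * and inode->i_mapping->nrpages == 5, one undirtied hole page was dropped
 * behind our back, so freed == 1 and both i_blocks and the per-sb
 * used_blocks counter are reduced by one page's worth.
 */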

bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);
	inode->i_mapping->nrpages += pages;

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in radix tree by a new item, while holding tree lock.
 */
static int shmem_radix_tree_replace(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	struct radix_tree_node *node;
	void __rcu **pslot;
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
	if (!item)
		return -ENOENT;
	if (item != expected)
		return -ENOENT;
	__radix_tree_replace(&mapping->i_pages, node, pslot,
			     replacement, NULL);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	void *item;

	rcu_read_lock();
	item = radix_tree_lookup(&mapping->i_pages, index);
	rcu_read_unlock();
	return item == swp_to_radix_entry(swap);
}
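
/*
 * Illustrative sketch (not part of the original file): a caller that decided
 * to swap in typically re-checks the entry and treats a mismatch as a lost
 * race, roughly:
 *
 *	if (!shmem_confirm_swap(mapping, index, swap))
 *		error = -EEXIST;
 *
 * with -EEXIST meaning "someone else already brought the page back; retry".
 */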

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
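
/*
 * For illustration (not part of the original file): the per-mount policy is
 * chosen with e.g. "mount -t tmpfs -o huge=within_size tmpfs /mnt", whereas
 * the special values above can only be set globally, e.g.
 * "echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled".
 */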

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}

static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int removed = 0, split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			removed++;
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			removed++;
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto leave;

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto leave;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed leave the inode on the list */
		if (ret)
			goto leave;

		split++;
drop:
		list_del_init(&info->shrinklist);
		removed++;
leave:
		iput(inode);
	}

	spin_lock(&sbinfo->shrinklist_lock);
	list_splice_tail(&list, &sbinfo->shrinklist);
	sbinfo->shrinklist_len -= removed;
	spin_unlock(&sbinfo->shrinklist_lock);

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */

static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
	    shmem_huge != SHMEM_HUGE_DENY)
		return true;
	return false;
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected)
{
	int error, nr = hpage_nr_pages(page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	xa_lock_irq(&mapping->i_pages);
	if (PageTransHuge(page)) {
		void __rcu **results;
		pgoff_t idx;
		int i;

		error = 0;
		if (radix_tree_gang_lookup_slot(&mapping->i_pages,
					&results, &idx, index, 1) &&
				idx < index + HPAGE_PMD_NR) {
			error = -EEXIST;
		}

		if (!error) {
			for (i = 0; i < HPAGE_PMD_NR; i++) {
				error = radix_tree_insert(&mapping->i_pages,
						index + i, page + i);
				VM_BUG_ON(error);
			}
			count_vm_event(THP_FILE_ALLOC);
		}
	} else if (!expected) {
		error = radix_tree_insert(&mapping->i_pages, index, page);
	} else {
		error = shmem_radix_tree_replace(mapping, index, expected,
								 page);
	}

	if (!error) {
		mapping->nrpages += nr;
		if (PageTransHuge(page))
			__inc_node_page_state(page, NR_SHMEM_THPS);
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
		xa_unlock_irq(&mapping->i_pages);
	} else {
		page->mapping = NULL;
		xa_unlock_irq(&mapping->i_pages);
		page_ref_sub(page, nr);
	}
	return error;
}
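
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * passes a locked, swap-backed page plus the entry it expects to replace,
 * roughly:
 *
 *	error = shmem_add_to_page_cache(page, mapping, index,
 *					swp_to_radix_entry(swap));
 *
 * or passes NULL for @expected when nothing should be at @index yet.
 */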

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_node_page_state(page, NR_FILE_PAGES);
	__dec_node_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	BUG_ON(error);
}
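
/*
 * Illustrative note (not part of the original file): shmem_writepage() is
 * the typical caller, passing swp_to_radix_entry(swap) as @radswap so the
 * slot keeps an exceptional swap entry instead of being emptied.
 */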

/*
 * Remove swap entry from radix tree, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	xa_lock_irq(&mapping->i_pages);
	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
	xa_unlock_irq(&mapping->i_pages);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();

	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
		if (iter.index >= end)
			break;

		page = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(page)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (radix_tree_exceptional_entry(page))
			swapped++;

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}
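
/*
 * Worked example (illustrative): if the radix tree holds three exceptional
 * swap entries in [start, end), the function returns 3 << PAGE_SHIFT,
 * i.e. 12KB with 4K pages.
 */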

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}
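
/*
 * Illustrative sketch (not part of the original file): per-VMA swap
 * reporting along the lines of /proc/<pid>/smaps can use this as roughly
 *
 *	if (shmem_mapping(vma->vm_file->f_mapping))
 *		swapped = shmem_swap_usage(vma);
 *
 * where shmem_mapping() is the usual "is this a shmem/tmpfs file?" test.
 */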

/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = find_get_entries(mapping, index,
					   PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		check_move_unevictable_pages(pvec.pages, pvec.nr);
		pagevec_release(&pvec);
		cond_resched();
	}
}
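
/*
 * Illustrative note (not part of the original file): this is the SHM_UNLOCK
 * counterpart of shmem_lock(); once the mapping stops being marked
 * unevictable, check_move_unevictable_pages() above migrates its pages back
 * to the normal LRU lists.
 */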

/*
 * Remove range of pages and swap entries from radix tree, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

			if (!trylock_page(page))
				continue;

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (start >= end)
		return;

	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				/*
				 * Partial thp truncate due 'start' in middle
				 * of THP: don't need to look on these pages
				 * again on !pvec.nr restart.
				 */
				if (index != round_down(end, HPAGE_PMD_NR))
					start++;
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}
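
/*
 * Illustrative note (not part of the original file): truncation passes
 * lend == -1 so that "end" wraps to the largest possible index, hole
 * punching passes a finite [lstart, lend], and unfalloc is true only when
 * undoing a failed shmem_fallocate() preallocation.
 */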

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
	}
	generic_fillattr(inode, stat);

	if (is_huge_enabled(sb_info))
		stat->blksize = HPAGE_PMD_SIZE;

	return 0;
}

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int error;

	error = setattr_prepare(dentry, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = current_time(inode);
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);

			/*
			 * Part of the huge page can be beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				spin_lock(&sbinfo->shrinklist_lock);
				/*
				 * _careful to defend against unlocked access to
				 * ->shrink_list in shmem_unused_huge_shrink()
				 */
				if (list_empty_careful(&info->shrinklist)) {
					list_add_tail(&info->shrinklist,
							&sbinfo->shrinklist);
					sbinfo->shrinklist_len++;
				}
				spin_unlock(&sbinfo->shrinklist_lock);
			}
		}
	}

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}
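
/*
 * Illustrative sketch (not part of the original file): the F_SEAL_SHRINK /
 * F_SEAL_GROW checks above are what make a sealed memfd refuse resizing,
 * roughly:
 *
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW);
 *	ftruncate(fd, new_size);
 *
 * where the ftruncate() now fails with EPERM.
 */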

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	unsigned long found = -1;
	unsigned int checked = 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(entry)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		if (entry == item) {