// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};
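
/*
 * The table is indexed by the file type bits of i_mode, i.e.
 * btrfs_type_by_mode[(mode & S_IFMT) >> S_SHIFT], so for example a
 * regular file maps to BTRFS_FT_REG_FILE.
 */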

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 const u64 offset,
						 const u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}
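
	/*
	 * Skip the first page of the range: per the comment above, the
	 * caller's writepage_end_io_hook() already did the cleanup for
	 * it, so the ordered extent update below starts one page in.
	 */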
	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
					    bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.  Returns 0 when the inline extent was
 * inserted, > 0 when the range did not qualify (the caller falls
 * back to a regular extent), and < 0 on error.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	/*
	 * An extent can only be inlined if it starts at file offset 0,
	 * fits entirely within the first sector, is not an exact multiple
	 * of the sector size when uncompressed, covers everything up to
	 * i_size, and fits within both the leaf item limit and the
	 * max_inline mount option.
	 */
	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space; an inlined extent
	 * won't be counted as a data extent, so free the reservation
	 * directly here.  At reserve time the amount is always aligned
	 * to page size, so just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct btrfs_work work;
};
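
/*
 * compress_file_range() queues one async_extent per compressed chunk on
 * the owning async_cow's extents list; submit_compressed_extents() later
 * walks that list to allocate disk space and submit the actual IO.
 */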

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Decide whether a delalloc range should be compressed: force-compress
 * mounts and defrag requests always compress, an inode flagged
 * NOCOMPRESS after bad ratios never does, and otherwise the compress
 * option / inode properties defer to the compression heuristic.
 */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
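
	/*
	 * A single compressed extent covers at most BTRFS_MAX_COMPRESSED
	 * (128K) of uncompressed data, so there is no point in grabbing
	 * more pages than one maximal extent can consume.
	 */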
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}
512

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(inode, start, end, 0,
						    BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned an error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so that the allocator
		 * does sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win: compare the page count read with the blocks on disk;
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  Redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		/*
		 * Reserve disk space for the compressed size; ram_size is
		 * passed separately as the logical (uncompressed) length
		 * of the extent.
		 */
		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fall back to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_cow->write_flags)) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	/*
	 * Free space cache inodes are expected to take the NOCOW write
	 * path; reaching the regular COW code here is a bug, hence the
	 * WARN and -EINVAL.
	 */
	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);