// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"

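/*
 * Arguments used when an inode is looked up by its location key within a
 * given root.
 */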
struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Cleanup all submitted ordered extents in the specified range to handle
 * errors from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}

	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
		offset += PAGE_SIZE;
		bytes -= PAGE_SIZE;
	}

	return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

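/*
 * Initialize the security-related metadata of a newly created inode: the
 * POSIX ACLs inherited from the parent directory and the security xattrs.
 */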
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = offset_in_page(start);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

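/*
 * Bookkeeping for asynchronous compressed writeback: compress_file_range()
 * queues one async_extent per (compressed or fallback uncompressed) range on
 * its async_chunk, and submit_compressed_extents() later walks that list to
 * allocate extents on disk and submit the IO.  An async_cow carries the array
 * of chunks covering one delalloc range.
 */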
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	atomic_t *pending;
};

struct async_cow {
	/* Number of chunks in flight; must be first in the structure */
	atomic_t num_chunks;
	struct async_chunk chunks[];
};

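/*
 * Queue a range (with its compressed pages, if any) on the async_chunk so
 * that the second phase of the writeback can process it.
 */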
static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct inode *inode)
{
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	if (!inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(BTRFS_I(inode)));
		return 0;
	}
	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int compressed_extents = 0;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it.  This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = offset_in_page(total_compressed);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(inode, start, end, 0,
						    BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);

			for (i = 0; i < nr_pages; i++) {
				WARN_ON(pages[i]->mapping);
				put_page(pages[i]);
			}
			kfree(pages);

			return 0;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			compressed_extents++;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_chunk, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return compressed_extents;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page) <= end)) {
		__set_page_dirty_nobuffers(async_chunk->locked_page);
		/* unlocked later on in the async handlers */
	}

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	compressed_extents++;

	return compressed_extents;
}

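/*
 * Release the pages holding an async_extent's compressed data when the
 * extent will not be (or could not be) submitted.
 */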
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_chunk->extents)) {
		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

retry:
		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			/* allocate blocks */
			ret = cow_file_range(inode, async_chunk->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret && async_chunk->locked_page)
				unlock_page(async_chunk->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_chunk->write_flags,
				    async_chunk->blkcg_css)) {
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			btrfs_writepage_endio_finish_ordered(p, start, end, 0);

			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

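/*
 * Derive a disk byte allocation hint for a new extent from the extent maps
 * already cached around @start.
 */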
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start,