// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

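/*
 * rb-tree comparator for inode_defrag records: order by root objectid
 * first, then by inode number, so lookups can walk records in
 * (root, ino) order
 */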
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/* insert a record for an inode into the defrag tree.  The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found, the defrag item you
 * pass in is freed
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from
		 * memory, and then re-read this inode, this new inode
		 * doesn't have the IN_DEFRAG flag. In this case, we may
		 * find an existing defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Requeue the defrag object. If there is a defrag object that points to
 * the same inode in the tree, we will merge them together (by
 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 */
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
				       struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;

	if (!__need_auto_defrag(fs_info))
		goto out;

	/*
	 * Here we don't check the IN_DEFRAG flag, because we need to merge
	 * them together.
	 */
	spin_lock(&fs_info->defrag_inodes_lock);
	ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&fs_info->defrag_inodes_lock);
	if (ret)
		goto out;
	return;
out:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}

/*
 * pick the defraggable inode that we want; if it doesn't exist, we will
 * get the next one.
 */
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

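/*
 * remove and free every record queued for auto defrag, dropping the
 * tree lock periodically so we don't hog it
 */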
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

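/* how much work btrfs_defrag_file() is asked to do per pass (in pages) */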
#define BTRFS_DEFRAG_BATCH	1024

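/*
 * defrag a single batch of the inode described by @defrag, requeueing
 * it if the batch was filled or if we haven't cycled back to the start
 * of the file yet
 */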
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	int num_defrag;
	int index;
	int ret;

	/* get the inode */
	key.objectid = defrag->root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	index = srcu_read_lock(&fs_info->subvol_srcu);

	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	key.objectid = defrag->ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}
	srcu_read_unlock(&fs_info->subvol_srcu, index);

	/* do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = defrag->last_offset;

	sb_start_write(fs_info->sb);
	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	/*
	 * if we filled the whole defrag batch, there
	 * must be more work to do.  Queue this defrag
	 * again
	 */
	if (num_defrag == BTRFS_DEFRAG_BATCH) {
		defrag->last_offset = range.start;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else if (defrag->last_offset && !defrag->cycled) {
		/*
		 * we didn't fill our defrag batch, but
		 * we didn't start at zero.  Make sure we loop
		 * around to the start of the file.
		 */
		defrag->last_offset = 0;
		defrag->cycled = 1;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}

	iput(inode);
	return 0;
cleanup:
	srcu_read_unlock(&fs_info->subvol_srcu, index);
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
			     &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_write_iter to fault page */
		if (unlikely(copied == 0))
			break;

		if (copied < PAGE_SIZE - offset) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		/* page checked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here. There should be no
		 * need to mark the pages accessed, as prepare_pages should
		 * have marked them accessed via find_or_create_page()
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		put_page(pages[i]);
	}
}

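/*
 * walk the range and mark every hole (any part not backed by an extent
 * map) with the EXTENT_DELALLOC_NEW bit in the inode's io tree
 */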
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
					 const u64 start,
					 const u64 len,
					 struct extent_state **cached_state)
{
	u64 search_start = start;
	const u64 end = start + len - 1;

	while (search_start < end) {
		const u64 search_len = end - search_start + 1;
		struct extent_map *em;
		u64 em_len;
		int ret = 0;

		em = btrfs_get_extent(inode, NULL, 0, search_start,
				      search_len, 0);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->block_start != EXTENT_MAP_HOLE)
			goto next;

		em_len = em->len;
		if (em->start < search_start)
			em_len -= search_start - em->start;
		if (em_len > search_len)
			em_len = search_len;

		ret = set_extent_bit(&inode->io_tree, search_start,
				     search_start + em_len - 1,
				     EXTENT_DELALLOC_NEW,
				     NULL, cached_state, GFP_NOFS);
next:
		search_start = extent_map_end(em);
		free_extent_map(em);
		if (ret)
			return ret;
	}
	return 0;
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
int btrfs_dirty_pages(struct inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);
	unsigned int extra_bits = 0;

	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, cached);

	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
		if (start_pos >= isize &&
		    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
			/*
			 * There can't be any extents following eof in this case
			 * so just set the delalloc new bit for the range
			 * directly.
			 */
			extra_bits |= EXTENT_DELALLOC_NEW;
		} else {
			err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
							    start_pos,
							    num_bytes, cached);
			if (err)
				return err;
		}
	}

	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(inode, end_pos);
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
			     int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;
	u64 gen;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;
	bool modified;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		int no_splits = 0;

		modified = false;
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		if (!split || !split2)
			no_splits = 1;

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		gen = em->generation;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);
		if (no_splits)
			goto next;

		if (em->start < start) {
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
						em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
						    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->block_start = em->block_start;
					split->orig_start = em->orig_start;
				} else {
					split->block_len = split->len;
					split->block_start = em->block_start
						+ diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				ret = add_extent_mapping(em_tree, split,
							 modified);
				ASSERT(ret == 0); /* Logic error */
			}
			free_extent_map(split);
			split = NULL;
		}
next:
		if (extent_map_in_tree(em))
			remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
			 u64 *drop_end, int drop_cache,
			 int replace_extent,
			 u32 extent_item_size,
			 int *key_inserted)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(BTRFS_I(inode));
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	int leafs_visited = 0;

	if (drop_cache)
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);

	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
		modify_tree = 0;

	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
		       root == fs_info->tree_root);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
		leafs_visited++;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leafs_visited++;
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
			}
			key.offset = start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(fs_info, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				inode_sub_bytes(inode, end - key.offset);
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				inode_sub_bytes(inode, extent_end - start);
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_DROP_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key.objectid,
						key.offset - extent_offset);
				ret = btrfs_free_extent(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
				inode_sub_bytes(inode,
						extent_end - key.offset);
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;