// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_buffer_leak_debug_check(void)
{
	struct extent_buffer *eb;

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_extent_buffer_leak_debug_check()	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

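/*
 * Record the bytes and range of @state in @changeset, but only if clearing
 * or setting @bits would actually change the state; a NULL @changeset is a
 * no-op.
 */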
static int add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

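/*
 * Hand the bio to the tree's submit_bio_hook when one is registered,
 * otherwise submit it directly via btrfsic_submit_bio(), and convert the
 * blk_status_t result to a negative errno.
 */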
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
						 mirror_num, bio_flags);
	else
		btrfsic_submit_bio(bio);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	if (epd->bio) {
		epd->bio->bi_status = errno_to_blk_status(ret);
		bio_endio(epd->bio);
		epd->bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;

	if (epd->bio) {
		ret = submit_one_bio(epd->bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	btrfs_extent_buffer_leak_debug_check();

	/*
	 * Make sure all delayed RCU frees are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

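/*
 * Set up an empty extent_io_tree: no states, zeroed dirty byte count, and
 * the caller-supplied owner id and private_data (the inode for data trees).
 */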
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
}

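/*
 * Drop every extent_state still in the tree.  Nothing should be waiting on
 * these states anymore, so they are simply erased and freed.
 */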
void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

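/*
 * Allocate and initialize a new extent_state with a single reference held
 * by the caller.  Returns NULL if the slab allocation fails.
 */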
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

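/*
 * Insert @node into @root keyed by @offset (the entry's end), starting the
 * walk at @search_start when given, or reusing a precomputed slot from
 * @p_in/@parent_in.  Returns the existing node if one already covers
 * @offset, otherwise links @node and returns NULL.
 */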
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * __etree_search - search @tree for an entry that contains @offset. Such
 * entry would have entry->start <= offset && entry->end >= offset.
 *
 * @tree - the tree to search
 * @offset - offset that should fall within an entry in @tree
 * @next_ret - pointer to the first entry whose range ends after @offset
 * @prev_ret - pointer to the first entry whose range begins before @offset
 * @p_ret - pointer where new node should be anchored (used when inserting an
 *	    entry in the tree)
 * @parent_ret - points to entry which would have been the parent of the entry,
 *               containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

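/*
 * Like __etree_search(), but when no entry contains @offset, return the
 * next entry after it instead of NULL, and remember the insertion slot in
 * @p_ret/@parent_ret for a subsequent insert_state().
 */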
static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

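/*
 * Return the extent_state that follows @state in the tree, or NULL if
 * @state is the last one.
 */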
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct inode *inode = tree->private_data;

	btrfs_panic(btrfs_sb(inode->i_sb), err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			      unsigned bits, int wake, int delete,
			      struct extent_state **cached_state,
			      gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

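/*
 * Sleep until someone wakes up @state->wq, dropping the tree lock while we
 * wait and re-taking it before returning.
 */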
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

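/*
 * Set @bits (minus the control bits) on @state, updating dirty_bytes, the
 * delalloc accounting hook and @changeset to match.
 */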
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

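/*
 * Stash @state in *@cached_ptr (taking a reference) if nothing is cached
 * yet and the state carries one of @flags, or unconditionally when @flags
 * is 0.
 */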
static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);