// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

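/*
 * Return true if the extent state is currently linked into an io tree's
 * rbtree (its rb_node is non-empty).  Used below both for sanity checks and
 * to decide whether a cached state can still be trusted.
 */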
static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	/* tells writepage not to lock the state bits for this range;
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

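/*
 * Record the bits being set (set == 1) or cleared (set == 0) on @state in
 * @changeset: bump bytes_changed by the state's length and add the range to
 * the range_changed ulist.  A NULL @changeset or a no-op transition is
 * ignored.
 */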
static int add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

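/*
 * Hand a built bio to the lower layers: go through the tree's
 * submit_bio_hook if one is registered, otherwise submit directly via
 * btrfsic_submit_bio().  Returns 0 or a negative errno.
 */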
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
						 mirror_num, bio_flags);
	else
		btrfsic_submit_bio(bio);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	if (epd->bio) {
		epd->bio->bi_status = errno_to_blk_status(ret);
		bio_endio(epd->bio);
		epd->bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;

	if (epd->bio) {
		ret = submit_one_bio(epd->bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * The endio is triggered either by successful bio execution
		 * or by the error handler of the submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio = NULL;
	}
	return ret;
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}

void __cold extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_state_cache);
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

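/*
 * Allocate and initialize a new extent_state.  The gfp mask is sanitized
 * for the slab allocator; returns NULL if the allocation fails.
 */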
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

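/*
 * Link @node into @root, either at the position hinted by @p_in/@parent_in
 * (from a previous search) or by walking down from @search_start (or the
 * root) keyed on @offset.  Returns the node that already covers @offset if
 * there is one, or NULL after a successful insertion.
 */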
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * __etree_search - search @tree for an entry that contains @offset. Such
 * entry would have entry->start <= offset && entry->end >= offset.
 *
 * @tree - the tree to search
 * @offset - offset that should fall within an entry in @tree
 * @next_ret - pointer to the first entry whose range ends after @offset
 * @prev_ret - pointer to the first entry whose range begins before @offset
 * @p_ret - pointer where new node should be anchored (used when inserting an
 *	    entry in the tree)
 * @parent_ret - points to the entry which would have been the parent of the
 *               entry containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

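/*
 * Search wrapper: return the entry containing @offset if there is one,
 * otherwise the next entry after @offset (which may be NULL).  @p_ret and
 * @parent_ret are filled in so a subsequent insertion can reuse the search.
 */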
static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_LOCKED or EXTENT_BOUNDARY set
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}
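
/*
 * For example (illustration only): two neighbouring states [0, 4095] and
 * [4096, 8191] carrying exactly the same state bits collapse into a single
 * [0, 8191] state; if either of them is locked or marked EXTENT_BOUNDARY
 * they are left alone.
 */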

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}
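
/*
 * For example (illustration only): splitting the state [0, 16383] at offset
 * 8192 leaves 'orig' covering [8192, 16383] and inserts 'prealloc' covering
 * [0, 8191], both carrying the original state bits.
 */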

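/*
 * Return the in-tree extent state that follows @state, or NULL if @state is
 * the last one.
 */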
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

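/*
 * GFP_ATOMIC fallback used while holding the tree spinlock: reuse an
 * existing preallocation if the caller has one, otherwise try an atomic
 * allocation (which may still return NULL).
 */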
static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct inode *inode = tree->private_data;

	btrfs_panic(btrfs_sb(inode->i_sb), err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			      unsigned bits, int wake, int delete,
			      struct extent_state **cached_state,
			      gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}
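
/*
 * Minimal usage sketch (illustration only, not a call site in this file):
 *
 *	struct extent_state *cached = NULL;
 *	int err;
 *
 *	err = __clear_extent_bit(tree, start, end, EXTENT_DELALLOC,
 *				 1, 0, &cached, GFP_NOFS, NULL);
 *
 * This wakes any waiters (wake == 1), keeps the other bits in place
 * (delete == 0) and records nothing in a changeset.
 */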

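/*
 * Sleep on @state's waitqueue until it is woken (the caller re-checks the
 * bits afterwards).  The tree lock is dropped around schedule() and
 * re-taken before returning, as the __releases/__acquires annotations say.
 */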
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

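/*
 * Counterpart of clear_state_bit(): set the given bits on @state, update
 * the dirty_bytes accounting when EXTENT_DIRTY is newly set, and record the
 * change in @changeset if one was supplied.
 */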
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

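/*
 * Stash @state in *@cached_ptr (taking a reference) if nothing is cached
 * yet and the state carries one of @flags; a zero @flags caches
 * unconditionally.
 */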
static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */
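
/*
 * For example (illustration only): a caller locking a range would pass
 * EXTENT_LOCKED both in the bits and in exclusive_bits, so a second attempt
 * on an already locked range fails with -EEXIST and reports the start of
 * the conflicting state through *failed_start.
 */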

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);