// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	*name;
	const char	*driver;
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver ?:
		btrfs_csums[csum_type].name;
}

size_t __const btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* Cowonly roots (everything that is not a reference counted cow subvolume)
 * just get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is set to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

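/*
 * Allocate and fill a tree_mod_elem describing operation @op on @slot of @eb.
 * For anything but a plain key addition the slot's current key and block
 * pointer are recorded as well, so the operation can be replayed later.
 * Returns NULL if the allocation fails.
 */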
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

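/*
 * Log a single key operation @op for @slot of @eb in the tree mod log.
 * Returns 0 if logging is not needed or the insertion succeeded, -ENOMEM if
 * the log element cannot be allocated, or the error from the rbtree insert.
 */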
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

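/*
 * Log a move of @nr_items node pointers inside @eb from @src_slot to
 * @dst_slot.  Slots that get overwritten by a move towards the front of the
 * buffer are logged as removals before the move record itself is inserted.
 */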
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

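/*
 * Insert the pre-allocated removal elements in @tm_list into the log, highest
 * slot first.  On failure the elements inserted so far are unlinked again and
 * the error is returned.  The caller must hold tree_mod_log_lock for writing.
 */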
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

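/*
 * Log the replacement of the tree root @old_root by @new_root.  When
 * @log_removal is set and the old root is not a leaf, every pointer in the
 * old root is logged as removed as well.
 */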
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

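/*
 * Walk the tree mod log looking for operations on the block at @start.
 * Elements with a sequence number below @min_seq are ignored; of the rest,
 * the one with the smallest sequence number is returned when @smallest is
 * set, otherwise the one with the largest.
 */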
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

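/*
 * Log the copy of @nr_items node pointers from @src (starting at @src_offset)
 * into @dst (starting at @dst_offset).  Each copied slot is recorded as a
 * removal from the source and an addition to the destination.
 */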
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

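/*
 * Log every pointer in @eb as removed, used when the whole node goes away.
 * Leaves are never logged, and nothing is done when there are no tree mod
 * log users.
 */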
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

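/*
 * Adjust extent back references when @buf is COWed into @cow.  Depending on
 * whether the block can still be shared, references on the old and new
 * blocks are updated as needed, and *@last_ref is set when the last
 * reference to @buf is going away.
 */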
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);