/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"

struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;

#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

/*
 * Maximum number of mirrors that can be available for all profiles counting
 * the target device of dev-replace as one. During an active device replace
 * procedure, the target device of the copy operation is a mirror for the
 * filesystem data as well and can be used to read data in order to repair
 * read errors on other disks.
 *
 * Current value is derived from RAID1C4 with 4 copies.
 */
#define BTRFS_MAX_MIRRORS (4 + 1)

#define BTRFS_MAX_LEVEL 8

#define BTRFS_OLDEST_GENERATION	0ULL

/*
 * the max metadata block size.  This limit is somewhat artificial,
 * but the memmove costs go through the roof for larger blocks.
 */
#define BTRFS_MAX_METADATA_BLOCKSIZE 65536

/*
 * we can actually store much bigger names, but let's not confuse the rest
 * of Linux
 */
#define BTRFS_NAME_LEN 255

/*
 * Theoretical limit is larger, but we keep this down to a sane
 * value. That should limit greatly the possibility of collisions on
 * inode ref items.
 */
#define BTRFS_LINK_MAX 65535U

#define BTRFS_EMPTY_DIR_SIZE 0

/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))

#define BTRFS_DIRTY_METADATA_THRESH	SZ_32M

/*
 * Use large batch size to reduce overhead of metadata updates.  On the reader
 * side, we only read it when we are close to ENOSPC and the read overhead is
 * mostly related to the number of CPUs, so it is OK to use an arbitrarily
 * large value here.
 */
#define BTRFS_TOTAL_BYTES_PINNED_BATCH	SZ_128M

#define BTRFS_MAX_EXTENT_SIZE SZ_128M

/*
 * Count how many BTRFS_MAX_EXTENT_SIZE units cover @size
 */
static inline u32 count_max_extents(u64 size)
{
	return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
}
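
/*
 * Illustrative example (added commentary, not part of the original header):
 * with BTRFS_MAX_EXTENT_SIZE == SZ_128M, a 200 MiB range needs two
 * maximum-sized extents while anything up to 128 MiB needs one:
 *
 *	count_max_extents(200ULL * SZ_1M) == 2
 *	count_max_extents(SZ_128M)        == 1
 *	count_max_extents(0)              == 0
 */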

static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
	BUG_ON(num_stripes == 0);
	return sizeof(struct btrfs_chunk) +
		sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
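
/*
 * Illustrative example (added commentary, not part of the original header):
 * struct btrfs_chunk already embeds one struct btrfs_stripe, so only the
 * additional stripes contribute extra space. For a 3-stripe chunk:
 *
 *	btrfs_chunk_item_size(3) ==
 *		sizeof(struct btrfs_chunk) + 2 * sizeof(struct btrfs_stripe)
 */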

/*
 * Runtime (in-memory) states of filesystem
 */
enum {
	/* Global indicator of serious filesystem errors */
	BTRFS_FS_STATE_ERROR,
	/*
	 * Filesystem is being remounted, which allows skipping some
	 * operations, like defrag
	 */
	BTRFS_FS_STATE_REMOUNTING,
	/* Track if a transaction abort has been reported on this filesystem */
	BTRFS_FS_STATE_TRANS_ABORTED,
	/*
	 * Bio operations should be blocked on this filesystem because a source
	 * or target device is being destroyed as part of a device replace
	 */
	BTRFS_FS_STATE_DEV_REPLACING,
	/* The btrfs_fs_info created for self-tests */
	BTRFS_FS_STATE_DUMMY_FS_INFO,
};
#define BTRFS_BACKREF_REV_MAX		256
#define BTRFS_BACKREF_REV_SHIFT		56
#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
					 BTRFS_BACKREF_REV_SHIFT)

#define BTRFS_OLD_BACKREF_REV		0
#define BTRFS_MIXED_BACKREF_REV		1
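
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * the backref revision lives in the top byte of a tree block header's
 * flags, so it can be extracted with:
 *
 *	u64 rev = (flags & BTRFS_BACKREF_REV_MASK) >> BTRFS_BACKREF_REV_SHIFT;
 *
 * which mirrors what the btrfs_header_backref_rev() accessor does.
 */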
/*
 * every tree block (leaf or node) starts with this header.
 */
struct btrfs_header {
	/* these first four must match the super block */
	u8 csum[BTRFS_CSUM_SIZE];
	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
	__le64 bytenr; /* which block this node is supposed to live in */
	__le64 flags;

	/* allowed to be different from the super from here on down */
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	__le64 generation;
	__le64 owner;
	__le32 nritems;
	u8 level;
} __attribute__ ((__packed__));

/*
 * this is a very generous portion of the super block, giving us
 * room to translate 14 chunks with 3 stripes each.
 */
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048

/*
 * just in case we somehow lose the roots and are not able to mount,
 * we store an array of the roots from previous transactions
 * in the super.
 */
#define BTRFS_NUM_BACKUP_ROOTS 4
struct btrfs_root_backup {
	__le64 tree_root;
	__le64 tree_root_gen;

	__le64 chunk_root;
	__le64 chunk_root_gen;

	__le64 extent_root;
	__le64 extent_root_gen;

	__le64 fs_root;
	__le64 fs_root_gen;

	__le64 dev_root;
	__le64 dev_root_gen;

	__le64 csum_root;
	__le64 csum_root_gen;

	__le64 total_bytes;
	__le64 bytes_used;
	__le64 num_devices;
	/* future */
	__le64 unused_64[4];

	u8 tree_root_level;
	u8 chunk_root_level;
	u8 extent_root_level;
	u8 fs_root_level;
	u8 dev_root_level;
	u8 csum_root_level;
	/* future and to align */
	u8 unused_8[10];
} __attribute__ ((__packed__));

/*
 * The super block basically lists the main trees of the FS.
 * It currently lacks any block count etc.
 */
struct btrfs_super_block {
	/* the first 4 fields must match struct btrfs_header */
	u8 csum[BTRFS_CSUM_SIZE];
	/* FS specific UUID, visible to user */
	u8 fsid[BTRFS_FSID_SIZE];
	__le64 bytenr; /* this block number */
	__le64 flags;

	/* allowed to be different from the btrfs_header from here on down */
	__le64 magic;
	__le64 generation;
	__le64 root;
	__le64 chunk_root;
	__le64 log_root;

	/* this will help find the new super based on the log root */
	__le64 log_root_transid;
	__le64 total_bytes;
	__le64 bytes_used;
	__le64 root_dir_objectid;
	__le64 num_devices;
	__le32 sectorsize;
	__le32 nodesize;
	__le32 __unused_leafsize;
	__le32 stripesize;
	__le32 sys_chunk_array_size;
	__le64 chunk_root_generation;
	__le64 compat_flags;
	__le64 compat_ro_flags;
	__le64 incompat_flags;
	__le16 csum_type;
	u8 root_level;
	u8 chunk_root_level;
	u8 log_root_level;
	struct btrfs_dev_item dev_item;

	char label[BTRFS_LABEL_SIZE];

	__le64 cache_generation;
	__le64 uuid_tree_generation;

	/* the UUID written into btree blocks */
	u8 metadata_uuid[BTRFS_FSID_SIZE];

	/* future expansion */
	__le64 reserved[28];
	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));

/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL

#define BTRFS_FEATURE_COMPAT_RO_SUPP			\
	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)

#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL

#define BTRFS_FEATURE_INCOMPAT_SUPP			\
	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID	|	\
	 BTRFS_FEATURE_INCOMPAT_RAID1C34)

#define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR		0ULL

/*
 * A leaf is full of items. offset and size tell us where to find
 * the item in the leaf (relative to the start of the data area)
 */
struct btrfs_item {
	struct btrfs_disk_key key;
	__le32 offset;
	__le32 size;
} __attribute__ ((__packed__));

/*
 * leaves have an item area and a data area:
 * [item0, item1....itemN] [free space] [dataN...data1, data0]
 *
 * The data is separate from the items to get the keys closer together
 * during searches.
 */
struct btrfs_leaf {
	struct btrfs_header header;
	struct btrfs_item items[];
} __attribute__ ((__packed__));

/*
 * all non-leaf blocks are nodes, they hold only keys and pointers to
 * other blocks
 */
struct btrfs_key_ptr {
	struct btrfs_disk_key key;
	__le64 blockptr;
	__le64 generation;
} __attribute__ ((__packed__));

struct btrfs_node {
	struct btrfs_header header;
	struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));

/*
 * btrfs_paths remember the path taken from the root down to the leaf.
 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
 * to any other levels that are present.
 *
 * The slots array records the index of the item or block pointer
 * used while walking the tree.
 */
enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
	int slots[BTRFS_MAX_LEVEL];
	/* if there is real range locking, this locks field will change */
	u8 locks[BTRFS_MAX_LEVEL];
	u8 reada;
	/* keep some upper locks as we walk down */
	u8 lowest_level;

	/*
	 * set by btrfs_split_item, tells search_slot to keep all locks
	 * and to force calls to keep space in the nodes
	 */
	unsigned int search_for_split:1;
	unsigned int keep_locks:1;
	unsigned int skip_locking:1;
	unsigned int leave_spinning:1;
	unsigned int search_commit_root:1;
	unsigned int need_commit_sem:1;
	unsigned int skip_release_on_error:1;
};
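
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * header): a read-only lookup typically allocates a path, searches for a
 * key and then reads the item out of the leaf and slot the path points to:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *		int slot = path->slots[0];
 *
 *		item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 *		...
 *	}
 *	btrfs_free_path(path);
 */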
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
					sizeof(struct btrfs_item))
struct btrfs_dev_replace {
	u64 replace_state;	/* see #define above */
	time64_t time_started;	/* seconds since 1-Jan-1970 */
	time64_t time_stopped;	/* seconds since 1-Jan-1970 */
	atomic64_t num_write_errors;
	atomic64_t num_uncorrectable_read_errors;

	u64 cursor_left;
	u64 committed_cursor_left;
	u64 cursor_left_last_write_of_item;
	u64 cursor_right;

	u64 cont_reading_from_srcdev_mode;	/* see #define above */

	int is_valid;
	int item_needs_writeback;
	struct btrfs_device *srcdev;
	struct btrfs_device *tgtdev;

	struct mutex lock_finishing_cancel_unmount;
	struct rw_semaphore rwsem;

	struct btrfs_scrub_progress scrub_progress;

	struct percpu_counter bio_counter;
	wait_queue_head_t replace_wait;
};

/*
 * free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes. They are used for all metadata
 * allocations. In ssd_spread mode they are also used for data allocations.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* largest extent in this cluster */
	u64 max_size;

	/* first extent starting offset */
	u64 window_start;

	/* We did a full search and couldn't create a cluster */
	bool fragmented;

	struct btrfs_block_group *block_group;
	/*
	 * when a cluster is allocated from a block group, we put the
	 * cluster onto a list in the block group so that it can
	 * be freed before the block group is freed.
	 */
	struct list_head block_group_list;
};

enum btrfs_caching_type {
	BTRFS_CACHE_NO,
	BTRFS_CACHE_STARTED,
	BTRFS_CACHE_FAST,
	BTRFS_CACHE_FINISHED,
	BTRFS_CACHE_ERROR,
};

/*
 * Tree to record all locked full stripes of a RAID5/6 block group
 */
struct btrfs_full_stripe_locks_tree {
	struct rb_root root;
	struct mutex lock;
};

/* delayed seq elem */
struct seq_list {
	struct list_head list;
	u64 seq;
};

#define SEQ_LIST_INIT(name)	{ .list = LIST_HEAD_INIT((name).list), .seq = 0 }

#define SEQ_LAST	((u64)-1)

enum btrfs_orphan_cleanup_state {
	ORPHAN_CLEANUP_STARTED	= 1,
	ORPHAN_CLEANUP_DONE	= 2,
};

void btrfs_init_async_reclaim_work(struct work_struct *work);

/* fs_info */
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_balance_control;
struct btrfs_delayed_root;

/*
 * Block group or device which contains an active swapfile. Used for preventing
 * unsafe operations while a swapfile is active.
 *
 * These are sorted on (ptr, inode) (note that a block group or device can
 * contain more than one swapfile). We compare the pointer values because we
 * don't actually care what the object is, we just need a quick check whether
 * the object exists in the rbtree.
 */
struct btrfs_swapfile_pin {
	struct rb_node node;
	void *ptr;
	struct inode *inode;
	/*
	 * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
	 * points to a struct btrfs_device.
	 */
	bool is_block_group;
};

bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);

enum {
	BTRFS_FS_BARRIER,
	BTRFS_FS_CLOSING_START,
	BTRFS_FS_CLOSING_DONE,
	BTRFS_FS_LOG_RECOVERING,
	BTRFS_FS_OPEN,
	BTRFS_FS_QUOTA_ENABLED,
	BTRFS_FS_UPDATE_UUID_TREE_GEN,
	BTRFS_FS_CREATING_FREE_SPACE_TREE,
	BTRFS_FS_BTREE_ERR,
	BTRFS_FS_LOG1_ERR,
	BTRFS_FS_LOG2_ERR,
	BTRFS_FS_QUOTA_OVERRIDE,
	/* Used to record internally whether fs has been frozen */
	BTRFS_FS_FROZEN,
	/*
	 * Indicate that a whole-filesystem exclusive operation is running
	 * (device replace, resize, device add/delete, balance)
	 */
	BTRFS_FS_EXCL_OP,
	/*
	 * To inform transaction_kthread we need an immediate commit so it
	 * doesn't need to wait for commit_interval
	 */
	BTRFS_FS_NEED_ASYNC_COMMIT,
	/*
	 * Indicate that balance has been set up from the ioctl and is in the
	 * main phase. The fs_info::balance_ctl is initialized.
	 * Set and cleared while holding fs_info::balance_mutex.
	 */
	BTRFS_FS_BALANCE_RUNNING,

	/* Indicate that the cleaner thread is awake and doing something. */
	BTRFS_FS_CLEANER_RUNNING,

	/*
	 * The checksumming has an optimized version and is considered fast,
	 * so we don't need to offload checksums to workqueues.
	 */
	BTRFS_FS_CSUM_IMPL_FAST,
};

struct btrfs_fs_info {
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	unsigned long flags;
	struct btrfs_root *extent_root;
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
	struct btrfs_root *fs_root;
	struct btrfs_root *csum_root;
	struct btrfs_root *quota_root;
	struct btrfs_root *uuid_root;
	struct btrfs_root *free_space_root;

	/* the log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;

	spinlock_t fs_roots_radix_lock;
	struct radix_tree_root fs_roots_radix;

	/* block group cache stuff */
	spinlock_t block_group_cache_lock;
	u64 first_logical_byte;
	struct rb_root block_group_cache_tree;

	/* keep track of unallocated space */
	atomic64_t free_chunk_space;

	struct extent_io_tree freed_extents[2];
	struct extent_io_tree *pinned_extents;

	/* logical->physical extent mapping */
	struct extent_map_tree mapping_tree;

	/*
	 * block reservation for extent, checksum, root tree and
	 * delayed dir index item
	 */
	struct btrfs_block_rsv global_block_rsv;
	/* block reservation for metadata operations */
	struct btrfs_block_rsv trans_block_rsv;
	/* block reservation for chunk tree */
	struct btrfs_block_rsv chunk_block_rsv;
	/* block reservation for delayed operations */
	struct btrfs_block_rsv delayed_block_rsv;
	/* block reservation for delayed refs */
	struct btrfs_block_rsv delayed_refs_rsv;

	struct btrfs_block_rsv empty_block_rsv;

	u64 generation;
	u64 last_trans_committed;
	u64 avg_delayed_ref_runtime;

	/*
	 * this is updated to the current trans every time a full commit
	 * is required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
	unsigned long mount_opt;
	/*
	 * Track requests for actions that need to be done during transaction
	 * commit (like for some mount options).
	 */
	unsigned long pending_changes;
	unsigned long compress_type:4;
	unsigned int compress_level;
	u32 commit_interval;
	/*
	 * It is a suggestive number; the read side is safe even if it gets a
	 * wrong number because we will write out the data into a regular
	 * extent. The write side (mount/remount) is under the ->s_umount lock,
	 * so it is also safe.
	 */
	u64 max_inline;

	struct btrfs_transaction *running_transaction;
	wait_queue_head_t transaction_throttle;
	wait_queue_head_t transaction_wait;
	wait_queue_head_t transaction_blocked_wait;
	wait_queue_head_t async_submit_wait;

	/*
	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
	 * when they are updated.
	 *
	 * Because we never clear the flags, we don't need to take the lock on
	 * the read side.
	 *
	 * We also don't need the lock when we mount the fs, because there is
	 * no other task that could be updating the flags.
	 */
	spinlock_t super_lock;
	struct btrfs_super_block *super_copy;
	struct btrfs_super_block *super_for_commit;
	struct super_block *sb;
	struct inode *btree_inode;
	struct mutex tree_log_mutex;
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
	struct mutex chunk_mutex;

	/*
	 * this is taken to make sure we don't set block groups ro after
	 * the free space cache has been allocated on them
	 */
	struct mutex ro_block_group_mutex;

	/* this is used during read/modify/write to make sure
	 * no two ios are trying to mod the same stripe at the same
	 * time
	 */
	struct btrfs_stripe_hash_table *stripe_hash_table;

	/*
	 * this protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make
	 * sure the commit code doesn't find the list temporarily empty
	 * because another function happens to be doing non-waiting preflush
	 * before jumping into the main commit.
	 */
	struct mutex ordered_operations_mutex;

	struct rw_semaphore commit_root_sem;

	struct rw_semaphore cleanup_work_sem;

	struct rw_semaphore subvol_sem;
	struct srcu_struct subvol_srcu;

	spinlock_t trans_lock;
	/*
	 * the reloc mutex goes with the trans lock, it is taken
	 * during commit to protect us from the relocation code
	 */
	struct mutex reloc_mutex;

	struct list_head trans_list;
	struct list_head dead_roots;
	struct list_head caching_block_groups;

	spinlock_t delayed_iput_lock;
	struct list_head delayed_iputs;
	atomic_t nr_delayed_iputs;
	wait_queue_head_t delayed_iputs_wait;

	/* this protects tree_mod_seq_list */
	spinlock_t tree_mod_seq_lock;
	atomic64_t tree_mod_seq;
	struct list_head tree_mod_seq_list;

	/* this protects tree_mod_log */
	rwlock_t tree_mod_log_lock;
	struct rb_root tree_mod_log;

	atomic_t async_delalloc_pages;

	/*
	 * this is used to protect the following list -- ordered_roots.
	 */
	spinlock_t ordered_root_lock;

	/*
	 * all fs/file tree roots in which there are data=ordered extents
	 * pending writeback are added into this list.
	 *
	 * these can span multiple transactions and basically include
	 * every dirty data page that isn't from nodatacow
	 */
	struct list_head ordered_roots;

	struct mutex delalloc_root_mutex;
	spinlock_t delalloc_root_lock;
	/* all fs/file tree roots that have delalloc inodes. */
	struct list_head delalloc_roots;

	/*
	 * there is a pool of worker threads for checksumming during writes
	 * and a pool for checksumming after reads.  This is because readers
	 * can run with FS locks held, and the writers may be waiting for
	 * those locks.  We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other
	 * two
	 */
	struct btrfs_workqueue *workers;
	struct btrfs_workqueue *delalloc_workers;
	struct btrfs_workqueue *flush_workers;
	struct btrfs_workqueue *endio_workers;
	struct btrfs_workqueue *endio_meta_workers;
	struct btrfs_workqueue *endio_raid56_workers;
	struct btrfs_workqueue *endio_repair_workers;
	struct btrfs_workqueue *rmw_workers;
	struct btrfs_workqueue *endio_meta_write_workers;
	struct btrfs_workqueue *endio_write_workers;
	struct btrfs_workqueue *endio_freespace_worker;
	struct btrfs_workqueue *caching_workers;
	struct btrfs_workqueue *readahead_workers;
	/*
	 * fixup workers take dirty pages that didn't properly go through
	 * the cow mechanism and make them safe to write.  It happens
	 * for the sys_munmap function call path
	 */
	struct btrfs_workqueue *fixup_workers;
	struct btrfs_workqueue *delayed_workers;

	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
	u32 thread_pool_size;

	struct kobject *space_info_kobj;

	u64 total_pinned;

	/* used to keep from writing metadata until there is a nice batch */
	struct percpu_counter dirty_metadata_bytes;
	struct percpu_counter delalloc_bytes;
	struct percpu_counter dio_bytes;
	s32 dirty_metadata_batch;
	s32 delalloc_batch;

	struct list_head dirty_cowonly_roots;

	struct btrfs_fs_devices *fs_devices;

	/*
	 * The space_info list is effectively read only after initial
	 * setup.  It is populated at mount time and cleaned up after
	 * all block groups are removed.  RCU is used to protect it.
	 */
	struct list_head space_info;

	struct btrfs_space_info *data_sinfo;

	struct reloc_control *reloc_ctl;

	/* data_alloc_cluster is only used in ssd_spread mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* all metadata allocations go through this cluster */
	struct btrfs_free_cluster meta_alloc_cluster;
	/* auto defrag inodes go here */
	spinlock_t defrag_inodes_lock;
	struct rb_root defrag_inodes;
	atomic_t defrag_running;

	/* Used to protect avail_{data, metadata, system}_alloc_bits */
	seqlock_t profiles_lock;
	/*
	 * these three are in extended format (availability of single
	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
	 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
	 */
	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;
	/* restriper state */
	spinlock_t balance_lock;
	struct mutex balance_mutex;
	atomic_t balance_pause_req;
	atomic_t balance_cancel_req;
	struct btrfs_balance_control *balance_ctl;
	wait_queue_head_t balance_wait_q;

	u32 data_chunk_allocations;
	u32 metadata_ratio;

	void *bdev_holder;

	/* private scrub information */
	struct mutex scrub_lock;
	atomic_t scrubs_running;
	atomic_t scrub_pause_req;
	atomic_t scrubs_paused;
	atomic_t scrub_cancel_req;
	wait_queue_head_t scrub_pause_wait;
	/*
	 * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
	 * running.
	 */
	refcount_t scrub_workers_refcnt;
	struct btrfs_workqueue *scrub_workers;
	struct btrfs_workqueue *scrub_wr_completion_workers;
	struct btrfs_workqueue *scrub_parity_workers;

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	u32 check_integrity_print_mask;
#endif
	/* is qgroup tracking in a consistent state? */
	u64 qgroup_flags;

	/* holds configuration and tracking. Protected by qgroup_lock */
	struct rb_root qgroup_tree;
	spinlock_t qgroup_lock;

	/*
	 * used to avoid frequently calling ulist_alloc()/ulist_free()
	 * when doing qgroup accounting, it must be protected by qgroup_lock.
	 */
	struct ulist *qgroup_ulist;

	/* protect user change for quota operations */
	struct mutex qgroup_ioctl_lock;

	/* list of dirty qgroups to be written at next commit */
	struct list_head dirty_qgroups;

	/* used by qgroup for an efficient tree traversal */
	u64 qgroup_seq;

	/* qgroup rescan items */
	struct mutex qgroup_rescan_lock; /* protects the progress item */
	struct btrfs_key qgroup_rescan_progress;
	struct btrfs_workqueue *qgroup_rescan_workers;
	struct completion qgroup_rescan_completion;
	struct btrfs_work qgroup_rescan_work;
	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */

	/* filesystem state */
	unsigned long fs_state;

	struct btrfs_delayed_root *delayed_root;

	/* readahead tree */
	spinlock_t reada_lock;
	struct radix_tree_root reada_tree;
	/* readahead works cnt */
	atomic_t reada_works_cnt;

	/* Extent buffer radix tree */
	spinlock_t buffer_lock;
	struct radix_tree_root buffer_radix;

	/* next backup root to be overwritten */
	int backup_root_index;
	/* device replace state */
	struct btrfs_dev_replace dev_replace;

	struct semaphore uuid_tree_rescan_sem;

	/* Used to reclaim the metadata space in the background. */
	struct work_struct async_reclaim_work;

	spinlock_t unused_bgs_lock;
	struct list_head unused_bgs;
	struct mutex unused_bg_unpin_mutex;
	struct mutex delete_unused_bgs_mutex;

	/* Cached block sizes */
	u32 nodesize;
	u32 sectorsize;
	u32 stripesize;
	/* Block groups and devices containing active swapfiles. */
	spinlock_t swapfile_pins_lock;
	struct rb_root swapfile_pins;

	struct crypto_shash *csum_shash;

	/*
	 * Number of send operations in progress.
	 * Updated while holding fs_info::balance_mutex.
	 */
	int send_in_progress;

#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	spinlock_t ref_verify_lock;
	struct rb_root block_tree;
#endif
};

static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
	return sb->s_fs_info;
}

struct btrfs_subvolume_writers {
	struct percpu_counter	counter;
	wait_queue_head_t	wait;
};

/*
 * The state of btrfs root
 */
enum {
	/*
	 * btrfs_record_root_in_trans is a multi-step process, and it can race
	 * with the balancing code.   But the race is very small, and only the
	 * first time the root is added to each transaction.  So IN_TRANS_SETUP
	 * is used to tell us when more checks are required
	 */
	BTRFS_ROOT_IN_TRANS_SETUP,
	BTRFS_ROOT_REF_COWS,
	BTRFS_ROOT_TRACK_DIRTY,
	BTRFS_ROOT_IN_RADIX,
	BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
	BTRFS_ROOT_DEFRAG_RUNNING,
	BTRFS_ROOT_FORCE_COW,
	BTRFS_ROOT_MULTI_LOG_TASKS,
	BTRFS_ROOT_DIRTY,
	BTRFS_ROOT_DELETING,

	/*
	 * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
	 *
	 * Set for the subvolume tree owning the reloc tree.
	 */
	BTRFS_ROOT_DEAD_RELOC_TREE,
	/* Mark dead root stored on device whose cleanup needs to be resumed */
	BTRFS_ROOT_DEAD_TREE,
};

/*
 * Record swapped tree blocks of a subvolume tree for delayed subtree trace
 * code. For details, check the comment in fs/btrfs/qgroup.c.
 */
struct btrfs_qgroup_swapped_blocks {
	spinlock_t lock;
	/* RB_EMPTY_ROOT() of above blocks[] */
	bool swapped;
	struct rb_root blocks[BTRFS_MAX_LEVEL];
};

/*
 * in-memory representation of the tree.  extent_root is used for all
 * allocations and for the extent tree extent_root root.
 */
struct btrfs_root {
	struct extent_buffer *node;

	struct extent_buffer *commit_root;
	struct btrfs_root *log_root;
	struct btrfs_root *reloc_root;

	unsigned long state;
	struct btrfs_root_item root_item;
	struct btrfs_key root_key;
	struct btrfs_fs_info *fs_info;
	struct extent_io_tree dirty_log_pages;

	struct mutex objectid_mutex;

	spinlock_t accounting_lock;
	struct btrfs_block_rsv *block_rsv;

	/* free ino cache stuff */
	struct btrfs_free_space_ctl *free_ino_ctl;
	enum btrfs_caching_type ino_cache_state;
	spinlock_t ino_cache_lock;
	wait_queue_head_t ino_cache_wait;
	struct btrfs_free_space_ctl *free_ino_pinned;
	u64 ino_cache_progress;
	struct inode *ino_cache_inode;

	struct mutex log_mutex;
	wait_queue_head_t log_writer_wait;
	wait_queue_head_t log_commit_wait[2];
	struct list_head log_ctxs[2];
	atomic_t log_writers;
	atomic_t log_commit[2];
	atomic_t log_batch;
	int log_transid;
	/* Updated whether or not the commit succeeds */
	int log_transid_committed;
	/* Only updated when the commit succeeds. */
	int last_log_commit;
	pid_t log_start_pid;

	u64 last_trans;

	u32 type;

	u64 highest_objectid;