// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

6
7
#include <linux/sched.h>
#include <linux/bio.h>
8
#include <linux/slab.h>
9
#include <linux/buffer_head.h>
10
#include <linux/blkdev.h>
11
#include <linux/ratelimit.h>
Ilya Dryomov's avatar
Ilya Dryomov committed
12
#include <linux/kthread.h>
David Woodhouse's avatar
David Woodhouse committed
13
#include <linux/raid/pq.h>
14
#include <linux/semaphore.h>
15
#include <linux/uuid.h>
Anand Jain's avatar
Anand Jain committed
16
#include <linux/list_sort.h>
17
#include "misc.h"
18
19
20
21
22
23
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
David Woodhouse's avatar
David Woodhouse committed
24
#include "raid56.h"
25
#include "async-thread.h"
26
#include "check-integrity.h"
27
#include "rcu-string.h"
28
#include "dev-replace.h"
29
#include "sysfs.h"
30
#include "tree-checker.h"
31
#include "space-info.h"
32
#include "block-group.h"
33

34
35
36
37
38
39
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
40
		.tolerated_failures = 1,
41
42
		.devs_increment	= 2,
		.ncopies	= 2,
43
		.nparity        = 0,
44
		.raid_name	= "raid10",
45
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
46
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
47
48
49
50
51
52
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
53
		.tolerated_failures = 1,
54
55
		.devs_increment	= 2,
		.ncopies	= 2,
56
		.nparity        = 0,
57
		.raid_name	= "raid1",
58
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
59
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
60
	},
61
62
63
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
64
		.devs_max	= 3,
65
66
67
68
69
70
71
72
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
73
74
75
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
76
		.devs_max	= 4,
77
78
79
80
81
82
83
84
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
85
86
87
88
89
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
90
		.tolerated_failures = 0,
91
92
		.devs_increment	= 1,
		.ncopies	= 2,
93
		.nparity        = 0,
94
		.raid_name	= "dup",
95
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
96
		.mindev_error	= 0,
97
98
99
100
101
102
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
103
		.tolerated_failures = 0,
104
105
		.devs_increment	= 1,
		.ncopies	= 1,
106
		.nparity        = 0,
107
		.raid_name	= "raid0",
108
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
109
		.mindev_error	= 0,
110
111
112
113
114
115
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
116
		.tolerated_failures = 0,
117
118
		.devs_increment	= 1,
		.ncopies	= 1,
119
		.nparity        = 0,
120
		.raid_name	= "single",
121
		.bg_flag	= 0,
122
		.mindev_error	= 0,
123
124
125
126
127
128
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
129
		.tolerated_failures = 1,
130
		.devs_increment	= 1,
131
		.ncopies	= 1,
132
		.nparity        = 1,
133
		.raid_name	= "raid5",
134
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
135
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
136
137
138
139
140
141
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
142
		.tolerated_failures = 2,
143
		.devs_increment	= 1,
144
		.ncopies	= 1,
145
		.nparity        = 2,
146
		.raid_name	= "raid6",
147
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
148
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
149
150
151
	},
};

152
/*
 * Map block group @flags to the name of the matching RAID profile.
 *
 * Returns the raid_name string stored in btrfs_raid_array, or NULL when the
 * index computed by btrfs_bg_flags_to_raid_index() lies past the end of the
 * table.
 */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int raid_idx = btrfs_bg_flags_to_raid_index(flags);

	return (raid_idx < BTRFS_NR_RAID_TYPES) ?
		btrfs_raid_array[raid_idx].raid_name : NULL;
}

162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 *
 * @bg_flags:	block group type and profile bits to describe
 * @buf:	destination buffer
 * @size_buf:	size of @buf in bytes
 *
 * Known bits are emitted as names separated by '|', any remaining bits as one
 * hexadecimal number. When no bit is set the text is "NONE". If the buffer is
 * too small the text is trimmed; every write is bounded by the space left.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		/* Bounded copy: the buffer may be shorter than "NONE" */
		snprintf(bp, size_bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	/* Whatever is still set has no name, print it as a raw number */
	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		/* Same truncation check as above, keeps size_bp from wrapping */
		if (ret < 0 || ret >= size_bp)
			goto out_overflow;
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}

216
static int init_first_rw_device(struct btrfs_trans_handle *trans);
217
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
218
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
219
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
220
221
222
223
224
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);
Yan Zheng's avatar
Yan Zheng committed
225

David Sterba's avatar
David Sterba committed
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

322
DEFINE_MUTEX(uuid_mutex);
323
static LIST_HEAD(fs_uuids);
324
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
325
326
327
{
	return &fs_uuids;
}
328

329
330
/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
331
332
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
333
334
335
336
337
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
338
339
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
340
341
342
{
	struct btrfs_fs_devices *fs_devs;

343
	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
344
345
346
347
348
349
350
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
351
	INIT_LIST_HEAD(&fs_devs->fs_list);
352
353
354
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

355
356
357
358
359
	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

360
361
362
	return fs_devs;
}

363
void btrfs_free_device(struct btrfs_device *device)
364
{
365
	WARN_ON(!list_empty(&device->post_commit_list));
366
	rcu_string_free(device->name);
367
	extent_io_tree_release(&device->alloc_state);
368
369
370
371
	bio_put(device->flush_bio);
	kfree(device);
}

Yan Zheng's avatar
Yan Zheng committed
372
373
374
375
376
377
378
379
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
380
		btrfs_free_device(device);
Yan Zheng's avatar
Yan Zheng committed
381
382
383
384
	}
	kfree(fs_devices);
}

385
/*
 * Empty the global fs_uuids list, freeing every btrfs_fs_devices on it
 * together with its devices.
 */
void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *cur, *next;

	list_for_each_entry_safe(cur, next, &fs_uuids, fs_list) {
		list_del(&cur->fs_list);
		free_fs_devices(cur);
	}
}

397
398
399
/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
400
 * btrfs_free_device.
401
 */
402
403
404
405
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

406
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
407
408
409
	if (!dev)
		return ERR_PTR(-ENOMEM);

410
411
412
413
414
415
416
417
418
419
	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

420
421
	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
422
	INIT_LIST_HEAD(&dev->post_commit_list);
423
424

	atomic_set(&dev->reada_in_flight, 0);
425
	atomic_set(&dev->dev_stats_ccnt, 0);
426
	btrfs_device_data_ordered_init(dev);
427
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
428
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
429
	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
430
431
432
433

	return dev;
}

434
435
static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
436
437
438
{
	struct btrfs_fs_devices *fs_devices;

439
440
	ASSERT(fsid);

441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
	if (metadata_fsid) {
		/*
		 * Handle scanned device having completed its fsid change but
		 * belonging to a fs_devices that was created by first scanning
		 * a device which didn't have its fsid/metadata_uuid changed
		 * at all and the CHANGING_FSID_V2 flag set.
		 */
		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
			if (fs_devices->fsid_change &&
			    memcmp(metadata_fsid, fs_devices->fsid,
				   BTRFS_FSID_SIZE) == 0 &&
			    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
				   BTRFS_FSID_SIZE) == 0) {
				return fs_devices;
			}
		}
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
		/*
		 * Handle scanned device having completed its fsid change but
		 * belonging to a fs_devices that was created by a device that
		 * has an outdated pair of fsid/metadata_uuid and
		 * CHANGING_FSID_V2 flag set.
		 */
		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
			if (fs_devices->fsid_change &&
			    memcmp(fs_devices->metadata_uuid,
				   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
			    memcmp(metadata_fsid, fs_devices->metadata_uuid,
				   BTRFS_FSID_SIZE) == 0) {
				return fs_devices;
			}
		}
472
473
474
	}

	/* Handle non-split brain cases */
475
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
476
477
478
479
480
481
482
483
484
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
485
486
487
488
	}
	return NULL;
}

489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * @device_path: path handed to blkdev_get_by_path()
 * @flags:	 open mode, also used for blkdev_put() on failure
 * @holder:	 holder handed to blkdev_get_by_path()
 * @flush:	 if non-zero, write back and wait on the device's page cache
 *		 mapping before reading
 * @bdev:	 out: the opened block device
 * @bh:		 out: buffer head returned by btrfs_read_dev_super()
 *
 * Returns 0 on success. On any failure the device is released again if it
 * was opened, both @bdev and @bh are set to NULL and the error is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int err;

	*bdev = blkdev_get_by_path(device_path, flags, holder);
	if (IS_ERR(*bdev)) {
		err = PTR_ERR(*bdev);
		goto fail;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);

	err = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (err)
		goto fail_put;

	invalidate_bdev(*bdev);

	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		err = PTR_ERR(*bh);
		goto fail_put;
	}

	return 0;

fail_put:
	blkdev_put(*bdev, flags);
fail:
	*bdev = NULL;
	*bh = NULL;
	return err;
}

526
527
528
529
530
531
532
533
534
535
536
/*
 * Return true when the name of @device equals @path. The name is read under
 * rcu_read_lock().
 */
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	bool same;

	rcu_read_lock();
	same = (strcmp(rcu_str_deref(device->name), path) == 0);
	rcu_read_unlock();

	return same;
}

537
538
539
540
541
542
543
/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided will it release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
544
545
546
 *  Return:	0 for success or if @path is NULL.
 * 		-EBUSY if @path is a mounted device.
 * 		-ENOENT if @path does not match any device in the list.
547
 */
548
static int btrfs_free_stale_devices(const char *path,
549
				     struct btrfs_device *skip_device)
Anand Jain's avatar
Anand Jain committed
550
{
551
552
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
553
554
555
556
	int ret = 0;

	if (path)
		ret = -ENOENT;
Anand Jain's avatar
Anand Jain committed
557

558
	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
Anand Jain's avatar
Anand Jain committed
559

560
		mutex_lock(&fs_devices->device_list_mutex);
561
562
563
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
564
				continue;
565
			if (path && !device->name)
Anand Jain's avatar
Anand Jain committed
566
				continue;
567
			if (path && !device_path_matched(path, device))
568
				continue;
569
570
571
572
573
574
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}
Anand Jain's avatar
Anand Jain committed
575
576

			/* delete the stale device */
577
578
579
580
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

581
			ret = 0;
582
			if (fs_devices->num_devices == 0)
583
				break;
584
585
		}
		mutex_unlock(&fs_devices->device_list_mutex);
586

587
588
589
590
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
Anand Jain's avatar
Anand Jain committed
591
592
		}
	}
593
594

	return ret;
Anand Jain's avatar
Anand Jain committed
595
596
}

597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
/*
 * Open the block device behind @device and account it in @fs_devices.
 *
 * @fs_devices:	counters and alloc_list to update
 * @device:	device to open; must have a name and no bdev yet
 * @flags:	open mode, stored in device->mode
 * @holder:	holder passed on to btrfs_get_bdev_and_sb()
 *
 * The super block read from disk must carry the same devid and device uuid
 * as @device. A seeding super block makes the device read-only and marks
 * @fs_devices as seeding; otherwise writeability follows bdev_read_only().
 *
 * Returns 0 on success, the error of btrfs_get_bdev_and_sb(), or -EINVAL
 * when @device is not in the expected state, the on-disk identity does not
 * match, or a seeding device also has the METADATA_UUID incompat flag.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct buffer_head *sb_bh;
	struct btrfs_super_block *sb;
	int ret;

	if (device->bdev || !device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &sb_bh);
	if (ret)
		return ret;

	sb = (struct btrfs_super_block *)sb_bh->b_data;

	/* The disk must be the very device this structure describes */
	if (btrfs_stack_device_id(&sb->dev_item) != device->devid ||
	    memcmp(device->uuid, sb->dev_item.uuid, BTRFS_UUID_SIZE))
		goto out_release;

	device->generation = btrfs_super_generation(sb);

	if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(sb) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto out_release;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else if (bdev_read_only(bdev)) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	} else {
		set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;

	/* Writeable devices, except the replace target devid, take allocations */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}

	brelse(sb_bh);
	return 0;

out_release:
	brelse(sb_bh);
	blkdev_put(bdev, flags);
	return -EINVAL;
}

670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
/*
 * The scanned device has CHANGING_FSID_V2 set while the fs_devices was
 * created from a disk that already completed its fsid change.
 *
 * Returns the first fs_devices whose metadata_uuid differs from its fsid,
 * equals the fsid of @disk_super, and which has no fsid change pending;
 * NULL if there is none.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *cur;

	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (memcmp(cur->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		if (memcmp(cur->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		if (cur->fsid_change)
			continue;
		return cur;
	}

	return NULL;
}

691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714

/*
 * The scanned device belongs to a filesystem that went through several
 * successful FSID changes which this device has not observed yet, so its
 * fsid differs from the registered one.
 *
 * Returns the first fs_devices whose metadata_uuid differs from its own
 * fsid and equals the metadata_uuid of @disk_super, while its fsid differs
 * from the fsid of @disk_super; NULL if there is none.
 */
static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *cur;

	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (memcmp(cur->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		if (memcmp(cur->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		if (memcmp(cur->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		return cur;
	}

	return NULL;
}
715
716
717
718
/*
 * Add new device to list of registered devices
 *
 * Returns:
719
720
 * device pointer which was just added or updated when successful
 * error pointer when failed
721
 */
722
static noinline struct btrfs_device *device_list_add(const char *path,
723
724
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
725
726
{
	struct btrfs_device *device;
727
	struct btrfs_fs_devices *fs_devices = NULL;
728
	struct rcu_string *name;
729
	u64 found_transid = btrfs_super_generation(disk_super);
730
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
731
732
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
733
734
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
735

736
737
738
739
740
741
742
743
744
745
746
747
748
749
	if (fsid_change_in_progress) {
		if (!has_metadata_uuid) {
			/*
			 * When we have an image which has CHANGING_FSID_V2 set
			 * it might belong to either a filesystem which has
			 * disks with completed fsid change or it might belong
			 * to fs with no UUID changes in effect, handle both.
			 */
			fs_devices = find_fsid_inprogress(disk_super);
			if (!fs_devices)
				fs_devices = find_fsid(disk_super->fsid, NULL);
		} else {
			fs_devices = find_fsid_changed(disk_super);
		}
750
751
752
753
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid(disk_super->fsid,
				       disk_super->metadata_uuid);
	} else {
754
		fs_devices = find_fsid(disk_super->fsid, NULL);
755
756
	}

757
758

	if (!fs_devices) {
759
760
761
762
763
764
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

765
		if (IS_ERR(fs_devices))
766
			return ERR_CAST(fs_devices);
767

768
769
		fs_devices->fsid_change = fsid_change_in_progress;

770
		mutex_lock(&fs_devices->device_list_mutex);
771
		list_add(&fs_devices->fs_list, &fs_uuids);
772

773
774
		device = NULL;
	} else {
775
		mutex_lock(&fs_devices->device_list_mutex);
776
777
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (has_metadata_uuid && fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
					disk_super->metadata_uuid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
793
	}
794

795
	if (!device) {
796
797
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
798
			return ERR_PTR(-EBUSY);
799
		}
Yan Zheng's avatar
Yan Zheng committed
800

801
802
803
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
804
			mutex_unlock(&fs_devices->device_list_mutex);
805
			/* we can safely leave the fs_devices entry around */
806
			return device;
807
		}
808
809
810

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
811
			btrfs_free_device(device);
812
			mutex_unlock(&fs_devices->device_list_mutex);
813
			return ERR_PTR(-ENOMEM);
814
		}
815
		rcu_assign_pointer(device->name, name);
816

817
		list_add_rcu(&device->dev_list, &fs_devices->devices);
818
		fs_devices->num_devices++;
819

Yan Zheng's avatar
Yan Zheng committed
820
		device->fs_devices = fs_devices;
821
		*new_device_added = true;
822
823

		if (disk_super->label[0])
824
825
826
827
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
828
		else
829
830
831
832
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));
833

834
	} else if (!device->name || strcmp(device->name->str, path)) {
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
856
857
858
859
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
860
		 */
861
		if (!fs_devices->opened && found_transid < device->generation) {
862
863
864
865
866
867
868
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
869
			mutex_unlock(&fs_devices->device_list_mutex);
870
			return ERR_PTR(-EEXIST);
871
		}
872

873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			struct block_device *path_bdev;

			path_bdev = lookup_bdev(path);
			if (IS_ERR(path_bdev)) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_CAST(path_bdev);
			}

			if (device->bdev != path_bdev) {
				bdput(path_bdev);
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(device->fs_info,
			"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
					disk_super->fsid, devid,
					rcu_str_deref(device->name), path);
				return ERR_PTR(-EEXIST);
			}
			bdput(path_bdev);
			btrfs_info_in_rcu(device->fs_info,
				"device fsid %pU devid %llu moved old:%s new:%s",
				disk_super->fsid, devid,
				rcu_str_deref(device->name), path);
		}

902
		name = rcu_string_strdup(path, GFP_NOFS);
903
904
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
905
			return ERR_PTR(-ENOMEM);
906
		}
907
908
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
909
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
910
			fs_devices->missing_devices--;
911
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
912
		}
913
914
	}

915
916
917
918
919
920
	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
921
	if (!fs_devices->opened) {
922
		device->generation = found_transid;
923
924
925
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}
926

927
928
	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

929
	mutex_unlock(&fs_devices->device_list_mutex);
930
	return device;
931
932
}

Yan Zheng's avatar
Yan Zheng committed
933
934
935
936
937
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
938
	int ret = 0;
Yan Zheng's avatar
Yan Zheng committed
939

940
	fs_devices = alloc_fs_devices(orig->fsid, NULL);
941
942
	if (IS_ERR(fs_devices))
		return fs_devices;
Yan Zheng's avatar
Yan Zheng committed
943

944
	mutex_lock(&orig->device_list_mutex);
Josef Bacik's avatar
Josef Bacik committed
945
	fs_devices->total_devices = orig->total_devices;
Yan Zheng's avatar
Yan Zheng committed
946
947

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
948
949
		struct rcu_string *name;

950
951
		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
952
953
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
Yan Zheng's avatar
Yan Zheng committed
954
			goto error;
955
		}
Yan Zheng's avatar
Yan Zheng committed
956

957
958
959
960
		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
961
		if (orig_dev->name) {
962
963
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
964
			if (!name) {
965
				btrfs_free_device(device);
966
				ret = -ENOMEM;
967
968
969
				goto error;
			}
			rcu_assign_pointer(device->name, name);
Julia Lawall's avatar
Julia Lawall committed
970
		}
Yan Zheng's avatar
Yan Zheng committed
971
972
973
974
975

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
976
	mutex_unlock(&orig->device_list_mutex);
Yan Zheng's avatar
Yan Zheng committed
977
978
	return fs_devices;
error:
979
	mutex_unlock(&orig->device_list_mutex);
Yan Zheng's avatar
Yan Zheng committed
980
	free_fs_devices(fs_devices);
981
	return ERR_PTR(ret);
Yan Zheng's avatar
Yan Zheng committed
982
983
}

984
985
986
987
988
/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
989
{
990
	struct btrfs_device *device, *next;
991
	struct btrfs_device *latest_dev = NULL;
992

993
994
	mutex_lock(&uuid_mutex);
again:
995
	/* This is the initialized path, it is safe to release the devices. */
996
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
997
998
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
999
1000
1001
1002
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
1003
				latest_dev = device;
1004
			}
Yan Zheng's avatar
Yan Zheng committed
1005
			continue;
1006
		}
Yan Zheng's avatar
Yan Zheng committed
1007

1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
1019
1020
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
1021
1022
1023
				continue;
			}
		}
Yan Zheng's avatar
Yan Zheng committed
1024
		if (device->bdev) {
1025
			blkdev_put(device->bdev, device->mode);
Yan Zheng's avatar
Yan Zheng committed
1026
1027
1028
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
1029
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
Yan Zheng's avatar
Yan Zheng committed
1030
			list_del_init(&device->dev_alloc_list);
1031
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1032
1033
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
1034
				fs_devices->rw_devices--;
Yan Zheng's avatar
Yan Zheng committed
1035
		}
Yan Zheng's avatar
Yan Zheng committed
1036
1037
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
1038
		btrfs_free_device(device);
1039
	}
Yan Zheng's avatar
Yan Zheng committed
1040
1041
1042
1043
1044
1045

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

1046
	fs_devices->latest_bdev = latest_dev->bdev;
1047

1048
1049
	mutex_unlock(&uuid_mutex);
}
1050

1051
1052
static void btrfs_close_bdev(struct btrfs_device *device)
{
1053
1054
1055
	if (!device->bdev)
		return;

1056
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1057
1058
1059
1060
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

1061
	blkdev_put(device->bdev, device->mode);
1062
1063
}

1064
static void btrfs_close_one_device(struct btrfs_device *device)
1065
1066
1067
1068
1069
1070
1071
1072
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

1073
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1074
1075
1076
1077
1078
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

1079
	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1080
1081
		fs_devices->missing_devices--;

1082
1083
	btrfs_close_bdev(device);

1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
1097

1098
1099
	synchronize_rcu();
	btrfs_free_device(device);
1100
1101
}

1102
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1103
{
1104
	struct btrfs_device *device, *tmp;
Yan Zheng's avatar
Yan Zheng committed
1105

Yan Zheng's avatar
Yan Zheng committed
1106
1107
	if (--fs_devices->opened > 0)
		return 0;
1108

1109
	mutex_lock(&fs_devices->device_list_mutex);
1110
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1111
		btrfs_close_one_device(device);
1112
	}
1113
1114
	mutex_unlock(&fs_devices->device_list_mutex);

Yan Zheng's avatar
Yan Zheng committed
1115
1116
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
Yan Zheng's avatar
Yan Zheng committed
1117
	fs_devices->opened = 0;
1118
	fs_devices->seeding = false;
Yan Zheng's avatar
Yan Zheng committed
1119

1120
1121
1122
	return 0;
}

Yan Zheng's avatar
Yan Zheng committed
1123
1124
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
Yan Zheng's avatar
Yan Zheng committed
1125
	struct btrfs_fs_devices *seed_devices = NULL;
Yan Zheng's avatar
Yan Zheng committed
1126
1127
1128
	int ret;

	mutex_lock(&uuid_mutex);
1129
	ret = close_fs_devices(fs_devices);
Yan Zheng's avatar
Yan Zheng committed
1130
1131
1132
1133
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
Yan Zheng's avatar
Yan Zheng committed
1134
	mutex_unlock(&uuid_mutex);
Yan Zheng's avatar
Yan Zheng committed
1135
1136
1137
1138

	while (