volumes.c 200 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
3
4
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */
5

6
7
#include <linux/sched.h>
#include <linux/bio.h>
8
#include <linux/slab.h>
9
#include <linux/buffer_head.h>
10
#include <linux/blkdev.h>
11
#include <linux/ratelimit.h>
Ilya Dryomov's avatar
Ilya Dryomov committed
12
#include <linux/kthread.h>
David Woodhouse's avatar
David Woodhouse committed
13
#include <linux/raid/pq.h>
14
#include <linux/semaphore.h>
15
#include <linux/uuid.h>
Anand Jain's avatar
Anand Jain committed
16
#include <linux/list_sort.h>
17
#include "misc.h"
18
19
20
21
22
23
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
David Woodhouse's avatar
David Woodhouse committed
24
#include "raid56.h"
25
#include "async-thread.h"
26
#include "check-integrity.h"
27
#include "rcu-string.h"
28
#include "dev-replace.h"
29
#include "sysfs.h"
30
#include "tree-checker.h"
31
#include "space-info.h"
32
#include "block-group.h"
33

34
35
36
37
38
39
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
40
		.tolerated_failures = 1,
41
42
		.devs_increment	= 2,
		.ncopies	= 2,
43
		.nparity        = 0,
44
		.raid_name	= "raid10",
45
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
46
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
47
48
49
50
51
52
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
53
		.tolerated_failures = 1,
54
55
		.devs_increment	= 2,
		.ncopies	= 2,
56
		.nparity        = 0,
57
		.raid_name	= "raid1",
58
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
59
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
60
	},
61
62
63
64
65
66
67
68
69
70
71
72
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
73
74
75
76
77
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
78
		.tolerated_failures = 0,
79
80
		.devs_increment	= 1,
		.ncopies	= 2,
81
		.nparity        = 0,
82
		.raid_name	= "dup",
83
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
84
		.mindev_error	= 0,
85
86
87
88
89
90
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
91
		.tolerated_failures = 0,
92
93
		.devs_increment	= 1,
		.ncopies	= 1,
94
		.nparity        = 0,
95
		.raid_name	= "raid0",
96
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
97
		.mindev_error	= 0,
98
99
100
101
102
103
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
104
		.tolerated_failures = 0,
105
106
		.devs_increment	= 1,
		.ncopies	= 1,
107
		.nparity        = 0,
108
		.raid_name	= "single",
109
		.bg_flag	= 0,
110
		.mindev_error	= 0,
111
112
113
114
115
116
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
117
		.tolerated_failures = 1,
118
		.devs_increment	= 1,
119
		.ncopies	= 1,
120
		.nparity        = 1,
121
		.raid_name	= "raid5",
122
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
123
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
124
125
126
127
128
129
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
130
		.tolerated_failures = 2,
131
		.devs_increment	= 1,
132
		.ncopies	= 1,
133
		.nparity        = 2,
134
		.raid_name	= "raid6",
135
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
136
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
137
138
139
	},
};

140
/*
 * Map block group @flags to the raid_name stored in btrfs_raid_array.
 *
 * Returns NULL when the index computed from @flags is not below
 * BTRFS_NR_RAID_TYPES.
 */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int raid_idx = btrfs_bg_flags_to_raid_index(flags);

	return (raid_idx < BTRFS_NR_RAID_TYPES) ?
		btrfs_raid_array[raid_idx].raid_name : NULL;
}

150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 *
 * Known flags are emitted by name separated by '|'; any bits left over are
 * appended as a hexadecimal number.  A zero @bg_flags yields "NONE".
 *
 * The text is trimmed when it does not fit, it's up to the caller to provide
 * a sufficiently large buffer.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		/* Bounded copy, a plain strcpy() would ignore @size_buf */
		snprintf(bp, size_bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		/* Whatever is left has no symbolic name, print it raw */
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		/*
		 * Same truncation check as in DESCRIBE_FLAG, otherwise the
		 * unsigned size_bp would wrap around on the subtraction.
		 */
		if (ret < 0 || ret >= size_bp)
			goto out_overflow;
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

out_overflow:;
}

204
static int init_first_rw_device(struct btrfs_trans_handle *trans);
205
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
206
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
207
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
208
209
210
211
212
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);
Yan Zheng's avatar
Yan Zheng committed
213

David Sterba's avatar
David Sterba committed
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

310
DEFINE_MUTEX(uuid_mutex);
311
static LIST_HEAD(fs_uuids);
312
/* Accessor handing out the address of the file-local fs_uuids list head. */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	struct list_head *uuids_head = &fs_uuids;

	return uuids_head;
}
316

317
318
/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
319
320
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
321
322
323
324
325
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
326
327
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
328
329
330
{
	struct btrfs_fs_devices *fs_devs;

331
	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
332
333
334
335
336
337
338
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
339
	INIT_LIST_HEAD(&fs_devs->fs_list);
340
341
342
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

343
344
345
346
347
	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

348
349
350
	return fs_devs;
}

351
void btrfs_free_device(struct btrfs_device *device)
352
{
353
	WARN_ON(!list_empty(&device->post_commit_list));
354
	rcu_string_free(device->name);
355
	extent_io_tree_release(&device->alloc_state);
356
357
358
359
	bio_put(device->flush_bio);
	kfree(device);
}

Yan Zheng's avatar
Yan Zheng committed
360
361
362
363
364
365
366
367
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
368
		btrfs_free_device(device);
Yan Zheng's avatar
Yan Zheng committed
369
370
371
372
	}
	kfree(fs_devices);
}

373
/* Drain the global fs_uuids list, freeing every fs_devices found on it. */
void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *cur, *next;

	list_for_each_entry_safe(cur, next, &fs_uuids, fs_list) {
		list_del(&cur->fs_list);
		free_fs_devices(cur);
	}
}

385
386
387
/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
388
 * btrfs_free_device.
389
 */
390
391
392
393
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

394
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
395
396
397
	if (!dev)
		return ERR_PTR(-ENOMEM);

398
399
400
401
402
403
404
405
406
407
	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

408
409
	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
410
	INIT_LIST_HEAD(&dev->post_commit_list);
411
412

	atomic_set(&dev->reada_in_flight, 0);
413
	atomic_set(&dev->dev_stats_ccnt, 0);
414
	btrfs_device_data_ordered_init(dev);
415
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
416
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
417
	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
418
419
420
421

	return dev;
}

422
423
static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
424
425
426
{
	struct btrfs_fs_devices *fs_devices;

427
428
	ASSERT(fsid);

429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
	if (metadata_fsid) {
		/*
		 * Handle scanned device having completed its fsid change but
		 * belonging to a fs_devices that was created by first scanning
		 * a device which didn't have its fsid/metadata_uuid changed
		 * at all and the CHANGING_FSID_V2 flag set.
		 */
		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
			if (fs_devices->fsid_change &&
			    memcmp(metadata_fsid, fs_devices->fsid,
				   BTRFS_FSID_SIZE) == 0 &&
			    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
				   BTRFS_FSID_SIZE) == 0) {
				return fs_devices;
			}
		}
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
		/*
		 * Handle scanned device having completed its fsid change but
		 * belonging to a fs_devices that was created by a device that
		 * has an outdated pair of fsid/metadata_uuid and
		 * CHANGING_FSID_V2 flag set.
		 */
		list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
			if (fs_devices->fsid_change &&
			    memcmp(fs_devices->metadata_uuid,
				   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
			    memcmp(metadata_fsid, fs_devices->metadata_uuid,
				   BTRFS_FSID_SIZE) == 0) {
				return fs_devices;
			}
		}
460
461
462
	}

	/* Handle non-split brain cases */
463
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
464
465
466
467
468
469
470
471
472
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
473
474
475
476
	}
	return NULL;
}

477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * When @flush is set the device's page cache mapping is written out first.
 * On success 0 is returned with the opened device in @bdev and the super
 * block buffer in @bh.  On failure the device is put again (if it had been
 * opened), both @bdev and @bh are set to NULL and the error is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);
	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);

	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret)
		goto error_put;

	invalidate_bdev(*bdev);

	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		goto error_put;
	}

	return 0;

error_put:
	blkdev_put(*bdev, flags);
error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

514
515
516
517
518
519
520
521
522
523
524
/*
 * Compare @path with the name of @device; the name is dereferenced under
 * the RCU read lock.  Returns true when both strings are equal.
 */
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	bool same;

	rcu_read_lock();
	same = !strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return same;
}

525
526
527
528
529
530
531
/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided will it release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
532
533
534
 *  Return:	0 for success or if @path is NULL.
 * 		-EBUSY if @path is a mounted device.
 * 		-ENOENT if @path does not match any device in the list.
535
 */
536
static int btrfs_free_stale_devices(const char *path,
537
				     struct btrfs_device *skip_device)
Anand Jain's avatar
Anand Jain committed
538
{
539
540
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
541
542
543
544
	int ret = 0;

	if (path)
		ret = -ENOENT;
Anand Jain's avatar
Anand Jain committed
545

546
	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
Anand Jain's avatar
Anand Jain committed
547

548
		mutex_lock(&fs_devices->device_list_mutex);
549
550
551
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
552
				continue;
553
			if (path && !device->name)
Anand Jain's avatar
Anand Jain committed
554
				continue;
555
			if (path && !device_path_matched(path, device))
556
				continue;
557
558
559
560
561
562
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}
Anand Jain's avatar
Anand Jain committed
563
564

			/* delete the stale device */
565
566
567
568
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

569
			ret = 0;
570
			if (fs_devices->num_devices == 0)
571
				break;
572
573
		}
		mutex_unlock(&fs_devices->device_list_mutex);
574

575
576
577
578
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
Anand Jain's avatar
Anand Jain committed
579
580
		}
	}
581
582

	return ret;
Anand Jain's avatar
Anand Jain committed
583
584
}

585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
/*
 * Open the block device behind @device and check its super block against
 * the in-memory device before accounting it in @fs_devices.
 *
 * Returns 0 on success.  Returns -EINVAL when @device already has a bdev or
 * has no name, when the devid or device UUID read from disk do not match,
 * or when a seeding super block also carries the METADATA_UUID incompat
 * flag.  An error from btrfs_get_bdev_and_sb() is passed through.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *queue;
	struct block_device *bdev;
	struct buffer_head *sb_bh;
	struct btrfs_super_block *sb;
	int ret;

	if (device->bdev || !device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &sb_bh);
	if (ret)
		return ret;

	sb = (struct btrfs_super_block *)sb_bh->b_data;

	/* The on-disk identity has to match what was registered */
	if (btrfs_stack_device_id(&sb->dev_item) != device->devid)
		goto error_brelse;
	if (memcmp(device->uuid, sb->dev_item.uuid, BTRFS_UUID_SIZE) != 0)
		goto error_brelse;

	device->generation = btrfs_super_generation(sb);

	if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(sb) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_brelse;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else if (bdev_read_only(bdev)) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	} else {
		set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	queue = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(queue))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	/* Writeable devices other than the replace target can allocate */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(sb_bh);

	return 0;

error_brelse:
	brelse(sb_bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change.
 *
 * Returns the first fs_devices whose metadata_uuid differs from its own fsid,
 * equals the fsid in @disk_super and which has no fsid change pending; NULL
 * if there is none.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *cur;

	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (cur->fsid_change)
			continue;
		if (memcmp(cur->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		if (memcmp(cur->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		return cur;
	}

	return NULL;
}

679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702

/*
 * Find the fs_devices a scanned device belongs to when its super block has
 * both the metadata UUID feature and an fsid change in progress.
 *
 * Returns the first fs_devices whose metadata_uuid differs from its own fsid
 * and equals the metadata_uuid in @disk_super while the fsids differ; NULL
 * if there is none.
 */
static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *cur;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently device didn't
	 * observe it. Meaning our fsid will be different than theirs.
	 */
	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (memcmp(cur->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		if (memcmp(cur->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		if (memcmp(cur->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		return cur;
	}

	return NULL;
}
703
704
705
706
/*
 * Add new device to list of registered devices
 *
 * Returns:
707
708
 * device pointer which was just added or updated when successful
 * error pointer when failed
709
 */
710
static noinline struct btrfs_device *device_list_add(const char *path,
711
712
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
713
714
{
	struct btrfs_device *device;
715
	struct btrfs_fs_devices *fs_devices = NULL;
716
	struct rcu_string *name;
717
	u64 found_transid = btrfs_super_generation(disk_super);
718
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
719
720
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
721
722
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
723

724
725
726
727
728
729
730
731
732
733
734
735
736
737
	if (fsid_change_in_progress) {
		if (!has_metadata_uuid) {
			/*
			 * When we have an image which has CHANGING_FSID_V2 set
			 * it might belong to either a filesystem which has
			 * disks with completed fsid change or it might belong
			 * to fs with no UUID changes in effect, handle both.
			 */
			fs_devices = find_fsid_inprogress(disk_super);
			if (!fs_devices)
				fs_devices = find_fsid(disk_super->fsid, NULL);
		} else {
			fs_devices = find_fsid_changed(disk_super);
		}
738
739
740
741
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid(disk_super->fsid,
				       disk_super->metadata_uuid);
	} else {
742
		fs_devices = find_fsid(disk_super->fsid, NULL);
743
744
	}

745
746

	if (!fs_devices) {
747
748
749
750
751
752
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

753
		if (IS_ERR(fs_devices))
754
			return ERR_CAST(fs_devices);
755

756
757
		fs_devices->fsid_change = fsid_change_in_progress;

758
		mutex_lock(&fs_devices->device_list_mutex);
759
		list_add(&fs_devices->fs_list, &fs_uuids);
760

761
762
		device = NULL;
	} else {
763
		mutex_lock(&fs_devices->device_list_mutex);
764
765
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (has_metadata_uuid && fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
					disk_super->metadata_uuid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
781
	}
782

783
	if (!device) {
784
785
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
786
			return ERR_PTR(-EBUSY);
787
		}
Yan Zheng's avatar
Yan Zheng committed
788

789
790
791
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
792
			mutex_unlock(&fs_devices->device_list_mutex);
793
			/* we can safely leave the fs_devices entry around */
794
			return device;
795
		}
796
797
798

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
799
			btrfs_free_device(device);
800
			mutex_unlock(&fs_devices->device_list_mutex);
801
			return ERR_PTR(-ENOMEM);
802
		}
803
		rcu_assign_pointer(device->name, name);
804

805
		list_add_rcu(&device->dev_list, &fs_devices->devices);
806
		fs_devices->num_devices++;
807

Yan Zheng's avatar
Yan Zheng committed
808
		device->fs_devices = fs_devices;
809
		*new_device_added = true;
810
811

		if (disk_super->label[0])
812
813
814
815
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
816
		else
817
818
819
820
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));
821

822
	} else if (!device->name || strcmp(device->name->str, path)) {
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
844
845
846
847
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
848
		 */
849
		if (!fs_devices->opened && found_transid < device->generation) {
850
851
852
853
854
855
856
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
857
			mutex_unlock(&fs_devices->device_list_mutex);
858
			return ERR_PTR(-EEXIST);
859
		}
860

861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			struct block_device *path_bdev;

			path_bdev = lookup_bdev(path);
			if (IS_ERR(path_bdev)) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_CAST(path_bdev);
			}

			if (device->bdev != path_bdev) {
				bdput(path_bdev);
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(device->fs_info,
			"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
					disk_super->fsid, devid,
					rcu_str_deref(device->name), path);
				return ERR_PTR(-EEXIST);
			}
			bdput(path_bdev);
			btrfs_info_in_rcu(device->fs_info,
				"device fsid %pU devid %llu moved old:%s new:%s",
				disk_super->fsid, devid,
				rcu_str_deref(device->name), path);
		}

890
		name = rcu_string_strdup(path, GFP_NOFS);
891
892
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
893
			return ERR_PTR(-ENOMEM);
894
		}
895
896
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
897
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
898
			fs_devices->missing_devices--;
899
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
900
		}
901
902
	}

903
904
905
906
907
908
	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
909
	if (!fs_devices->opened) {
910
		device->generation = found_transid;
911
912
913
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}
914

915
916
	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

917
	mutex_unlock(&fs_devices->device_list_mutex);
918
	return device;
919
920
}

Yan Zheng's avatar
Yan Zheng committed
921
922
923
924
925
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
926
	int ret = 0;
Yan Zheng's avatar
Yan Zheng committed
927

928
	fs_devices = alloc_fs_devices(orig->fsid, NULL);
929
930
	if (IS_ERR(fs_devices))
		return fs_devices;
Yan Zheng's avatar
Yan Zheng committed
931

932
	mutex_lock(&orig->device_list_mutex);
Josef Bacik's avatar
Josef Bacik committed
933
	fs_devices->total_devices = orig->total_devices;
Yan Zheng's avatar
Yan Zheng committed
934
935

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
936
937
		struct rcu_string *name;

938
939
		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
940
941
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
Yan Zheng's avatar
Yan Zheng committed
942
			goto error;
943
		}
Yan Zheng's avatar
Yan Zheng committed
944

945
946
947
948
		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
949
		if (orig_dev->name) {
950
951
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
952
			if (!name) {
953
				btrfs_free_device(device);
954
				ret = -ENOMEM;
955
956
957
				goto error;
			}
			rcu_assign_pointer(device->name, name);
Julia Lawall's avatar
Julia Lawall committed
958
		}
Yan Zheng's avatar
Yan Zheng committed
959
960
961
962
963

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
964
	mutex_unlock(&orig->device_list_mutex);
Yan Zheng's avatar
Yan Zheng committed
965
966
	return fs_devices;
error:
967
	mutex_unlock(&orig->device_list_mutex);
Yan Zheng's avatar
Yan Zheng committed
968
	free_fs_devices(fs_devices);
969
	return ERR_PTR(ret);
Yan Zheng's avatar
Yan Zheng committed
970
971
}

972
973
974
975
976
/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
977
{
978
	struct btrfs_device *device, *next;
979
	struct btrfs_device *latest_dev = NULL;
980

981
982
	mutex_lock(&uuid_mutex);
again:
983
	/* This is the initialized path, it is safe to release the devices. */
984
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
985
986
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
987
988
989
990
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
991
				latest_dev = device;
992
			}
Yan Zheng's avatar
Yan Zheng committed
993
			continue;
994
		}
Yan Zheng's avatar
Yan Zheng committed
995

996
997
998
999
1000
		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
For faster browsing, not all history is shown. View entire blame