bae1d2a7b2329f31a9491e3a957ddf985ce7d42b
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/bio.h>
8 #include <linux/slab.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/iocontext.h>
12 #include <linux/capability.h>
13 #include <linux/ratelimit.h>
14 #include <linux/kthread.h>
15 #include <linux/raid/pq.h>
16 #include <linux/semaphore.h>
17 #include <linux/uuid.h>
18 #include <linux/list_sort.h>
19 #include <asm/div64.h>
20 #include "ctree.h"
21 #include "extent_map.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "volumes.h"
26 #include "raid56.h"
27 #include "async-thread.h"
28 #include "check-integrity.h"
29 #include "rcu-string.h"
30 #include "math.h"
31 #include "dev-replace.h"
32 #include "sysfs.h"
33
34 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
35         [BTRFS_RAID_RAID10] = {
36                 .sub_stripes    = 2,
37                 .dev_stripes    = 1,
38                 .devs_max       = 0,    /* 0 == as many as possible */
39                 .devs_min       = 4,
40                 .tolerated_failures = 1,
41                 .devs_increment = 2,
42                 .ncopies        = 2,
43         },
44         [BTRFS_RAID_RAID1] = {
45                 .sub_stripes    = 1,
46                 .dev_stripes    = 1,
47                 .devs_max       = 2,
48                 .devs_min       = 2,
49                 .tolerated_failures = 1,
50                 .devs_increment = 2,
51                 .ncopies        = 2,
52         },
53         [BTRFS_RAID_DUP] = {
54                 .sub_stripes    = 1,
55                 .dev_stripes    = 2,
56                 .devs_max       = 1,
57                 .devs_min       = 1,
58                 .tolerated_failures = 0,
59                 .devs_increment = 1,
60                 .ncopies        = 2,
61         },
62         [BTRFS_RAID_RAID0] = {
63                 .sub_stripes    = 1,
64                 .dev_stripes    = 1,
65                 .devs_max       = 0,
66                 .devs_min       = 2,
67                 .tolerated_failures = 0,
68                 .devs_increment = 1,
69                 .ncopies        = 1,
70         },
71         [BTRFS_RAID_SINGLE] = {
72                 .sub_stripes    = 1,
73                 .dev_stripes    = 1,
74                 .devs_max       = 1,
75                 .devs_min       = 1,
76                 .tolerated_failures = 0,
77                 .devs_increment = 1,
78                 .ncopies        = 1,
79         },
80         [BTRFS_RAID_RAID5] = {
81                 .sub_stripes    = 1,
82                 .dev_stripes    = 1,
83                 .devs_max       = 0,
84                 .devs_min       = 2,
85                 .tolerated_failures = 1,
86                 .devs_increment = 1,
87                 .ncopies        = 2,
88         },
89         [BTRFS_RAID_RAID6] = {
90                 .sub_stripes    = 1,
91                 .dev_stripes    = 1,
92                 .devs_max       = 0,
93                 .devs_min       = 3,
94                 .tolerated_failures = 2,
95                 .devs_increment = 1,
96                 .ncopies        = 3,
97         },
98 };
99
100 const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
101         [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
102         [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
103         [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
104         [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
105         [BTRFS_RAID_SINGLE] = 0,
106         [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
107         [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
108 };
109
110 /*
111  * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
112  * condition is not met. Zero means there's no corresponding
113  * BTRFS_ERROR_DEV_*_NOT_MET value.
114  */
115 const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
116         [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
117         [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
118         [BTRFS_RAID_DUP]    = 0,
119         [BTRFS_RAID_RAID0]  = 0,
120         [BTRFS_RAID_SINGLE] = 0,
121         [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
122         [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
123 };
124
125 static int init_first_rw_device(struct btrfs_trans_handle *trans,
126                                 struct btrfs_fs_info *fs_info);
127 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
128 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
129 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
130 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
131 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
132                              enum btrfs_map_op op,
133                              u64 logical, u64 *length,
134                              struct btrfs_bio **bbio_ret,
135                              int mirror_num, int need_raid_map);
136
137 /*
138  * Device locking
139  * ==============
140  *
141  * There are several mutexes that protect manipulation of devices and low-level
142  * structures like chunks but not block groups, extents or files
143  *
144  * uuid_mutex (global lock)
145  * ------------------------
146  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
147  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
148  * device) or requested by the device= mount option
149  *
150  * the mutex can be very coarse and can cover long-running operations
151  *
152  * protects: updates to fs_devices counters like missing devices, rw devices,
153  * seeding, structure cloning, opening/closing devices at mount/umount time
154  *
155  * global::fs_devs - add, remove, updates to the global list
156  *
157  * does not protect: manipulation of the fs_devices::devices list!
158  *
159  * btrfs_device::name - renames (write side), read is RCU
160  *
161  * fs_devices::device_list_mutex (per-fs, with RCU)
162  * ------------------------------------------------
163  * protects updates to fs_devices::devices, i.e. adding and deleting
164  *
165  * simple list traversal with read-only actions can be done with RCU protection
166  *
167  * may be used to exclude some operations from running concurrently without any
168  * modifications to the list (see write_all_supers)
169  *
170  * volume_mutex
171  * ------------
172  * coarse lock owned by a mounted filesystem; used to exclude some operations
173  * that cannot run in parallel and affect the higher-level properties of the
174  * filesystem like: device add/deleting/resize/replace, or balance
175  *
176  * balance_mutex
177  * -------------
178  * protects balance structures (status, state) and context accessed from
179  * several places (internally, ioctl)
180  *
181  * chunk_mutex
182  * -----------
183  * protects chunks, adding or removing during allocation, trim or when a new
184  * device is added/removed
185  *
186  * cleaner_mutex
187  * -------------
188  * a big lock that is held by the cleaner thread and prevents running subvolume
189  * cleaning together with relocation or delayed iputs
190  *
191  *
192  * Lock nesting
193  * ============
194  *
195  * uuid_mutex
196  *   volume_mutex
197  *     device_list_mutex
198  *       chunk_mutex
199  *     balance_mutex
200  *
201  *
202  * Exclusive operations, BTRFS_FS_EXCL_OP
203  * ======================================
204  *
205  * Maintains the exclusivity of the following operations that apply to the
206  * whole filesystem and cannot run in parallel.
207  *
208  * - Balance (*)
209  * - Device add
210  * - Device remove
211  * - Device replace (*)
212  * - Resize
213  *
214  * The device operations (as above) can be in one of the following states:
215  *
216  * - Running state
217  * - Paused state
218  * - Completed state
219  *
220  * Only device operations marked with (*) can go into the Paused state for the
221  * following reasons:
222  *
223  * - ioctl (only Balance can be Paused through ioctl)
224  * - filesystem remounted as read-only
225  * - filesystem unmounted and mounted as read-only
226  * - system power-cycle and filesystem mounted as read-only
227  * - filesystem or device errors leading to forced read-only
228  *
229  * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
230  * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
231  * A device operation in Paused or Running state can be canceled or resumed
232  * either by ioctl (Balance only) or when remounted as read-write.
233  * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
234  * completed.
235  */
236
237 DEFINE_MUTEX(uuid_mutex);
238 static LIST_HEAD(fs_uuids);
239 struct list_head *btrfs_get_fs_uuids(void)
240 {
241         return &fs_uuids;
242 }
243
244 /*
245  * alloc_fs_devices - allocate struct btrfs_fs_devices
246  * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
247  *
248  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
249  * The returned struct is not linked onto any lists and can be destroyed with
250  * kfree() right away.
251  */
252 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
253 {
254         struct btrfs_fs_devices *fs_devs;
255
256         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
257         if (!fs_devs)
258                 return ERR_PTR(-ENOMEM);
259
260         mutex_init(&fs_devs->device_list_mutex);
261
262         INIT_LIST_HEAD(&fs_devs->devices);
263         INIT_LIST_HEAD(&fs_devs->resized_devices);
264         INIT_LIST_HEAD(&fs_devs->alloc_list);
265         INIT_LIST_HEAD(&fs_devs->fs_list);
266         if (fsid)
267                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
268
269         return fs_devs;
270 }
271
272 static void free_device(struct btrfs_device *device)
273 {
274         rcu_string_free(device->name);
275         bio_put(device->flush_bio);
276         kfree(device);
277 }
278
279 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
280 {
281         struct btrfs_device *device;
282         WARN_ON(fs_devices->opened);
283         while (!list_empty(&fs_devices->devices)) {
284                 device = list_entry(fs_devices->devices.next,
285                                     struct btrfs_device, dev_list);
286                 list_del(&device->dev_list);
287                 free_device(device);
288         }
289         kfree(fs_devices);
290 }
291
292 static void btrfs_kobject_uevent(struct block_device *bdev,
293                                  enum kobject_action action)
294 {
295         int ret;
296
297         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
298         if (ret)
299                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
300                         action,
301                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
302                         &disk_to_dev(bdev->bd_disk)->kobj);
303 }
304
305 void __exit btrfs_cleanup_fs_uuids(void)
306 {
307         struct btrfs_fs_devices *fs_devices;
308
309         while (!list_empty(&fs_uuids)) {
310                 fs_devices = list_entry(fs_uuids.next,
311                                         struct btrfs_fs_devices, fs_list);
312                 list_del(&fs_devices->fs_list);
313                 free_fs_devices(fs_devices);
314         }
315 }
316
317 /*
318  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
319  * Returned struct is not linked onto any lists and must be destroyed using
320  * free_device.
321  */
322 static struct btrfs_device *__alloc_device(void)
323 {
324         struct btrfs_device *dev;
325
326         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
327         if (!dev)
328                 return ERR_PTR(-ENOMEM);
329
330         /*
331          * Preallocate a bio that's always going to be used for flushing device
332          * barriers and matches the device lifespan
333          */
334         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
335         if (!dev->flush_bio) {
336                 kfree(dev);
337                 return ERR_PTR(-ENOMEM);
338         }
339
340         INIT_LIST_HEAD(&dev->dev_list);
341         INIT_LIST_HEAD(&dev->dev_alloc_list);
342         INIT_LIST_HEAD(&dev->resized_list);
343
344         spin_lock_init(&dev->io_lock);
345
346         atomic_set(&dev->reada_in_flight, 0);
347         atomic_set(&dev->dev_stats_ccnt, 0);
348         btrfs_device_data_ordered_init(dev);
349         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
350         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
351
352         return dev;
353 }
354
355 /*
356  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
357  * return NULL.
358  *
359  * If devid and uuid are both specified, the match must be exact, otherwise
360  * only devid is used.
361  */
362 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
363                 u64 devid, const u8 *uuid)
364 {
365         struct btrfs_device *dev;
366
367         list_for_each_entry(dev, &fs_devices->devices, dev_list) {
368                 if (dev->devid == devid &&
369                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
370                         return dev;
371                 }
372         }
373         return NULL;
374 }
375
376 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
377 {
378         struct btrfs_fs_devices *fs_devices;
379
380         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
381                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
382                         return fs_devices;
383         }
384         return NULL;
385 }
386
387 static int
388 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
389                       int flush, struct block_device **bdev,
390                       struct buffer_head **bh)
391 {
392         int ret;
393
394         *bdev = blkdev_get_by_path(device_path, flags, holder);
395
396         if (IS_ERR(*bdev)) {
397                 ret = PTR_ERR(*bdev);
398                 goto error;
399         }
400
401         if (flush)
402                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
403         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
404         if (ret) {
405                 blkdev_put(*bdev, flags);
406                 goto error;
407         }
408         invalidate_bdev(*bdev);
409         *bh = btrfs_read_dev_super(*bdev);
410         if (IS_ERR(*bh)) {
411                 ret = PTR_ERR(*bh);
412                 blkdev_put(*bdev, flags);
413                 goto error;
414         }
415
416         return 0;
417
418 error:
419         *bdev = NULL;
420         *bh = NULL;
421         return ret;
422 }
423
424 static void requeue_list(struct btrfs_pending_bios *pending_bios,
425                         struct bio *head, struct bio *tail)
426 {
427
428         struct bio *old_head;
429
430         old_head = pending_bios->head;
431         pending_bios->head = head;
432         if (pending_bios->tail)
433                 tail->bi_next = old_head;
434         else
435                 pending_bios->tail = tail;
436 }
437
438 /*
439  * we try to collect pending bios for a device so we don't get a large
440  * number of procs sending bios down to the same device.  This greatly
441  * improves the scheduler's ability to collect and merge the bios.
442  *
443  * But, it also turns into a long list of bios to process and that is sure
444  * to eventually make the worker thread block.  The solution here is to
445  * make some progress and then put this work struct back at the end of
446  * the list if the block device is congested.  This way, multiple devices
447  * can make progress from a single worker thread.
     *
     * @device: device whose pending_sync_bios and pending_bios lists are run.
     *
     * Each pass detaches one whole list while holding device->io_lock and
     * submits its bios with the lock dropped.  Bios that are not submitted
     * in a pass are put back at the front of their list with requeue_list().
448  */
449 static noinline void run_scheduled_bios(struct btrfs_device *device)
450 {
451         struct btrfs_fs_info *fs_info = device->fs_info;
452         struct bio *pending;
453         struct backing_dev_info *bdi;
454         struct btrfs_pending_bios *pending_bios;
455         struct bio *tail;
456         struct bio *cur;
457         int again = 0;
458         unsigned long num_run;
459         unsigned long batch_run = 0;
460         unsigned long last_waited = 0;
461         int force_reg = 0;
462         int sync_pending = 0;
463         struct blk_plug plug;
464
465         /*
466          * this function runs all the bios we've collected for
467          * a particular device.  We don't want to wander off to
468          * another device without first sending all of these down.
469          * So, setup a plug here and finish it off before we return
470          */
471         blk_start_plug(&plug);
472
473         bdi = device->bdev->bd_bdi;
474
475 loop:
476         spin_lock(&device->io_lock);
477
            /* every jump to loop_lock is made with device->io_lock held */
478 loop_lock:
479         num_run = 0;
480
481         /* take all the bios off the list at once and process them
482          * later on (without the lock held).  But, remember the
483          * tail and other pointers so the bios can be properly reinserted
484          * into the list if we hit congestion
485          */
            /*
             * force_reg alternates the choice: a pass that takes the sync
             * list sets it, so the following pass takes the regular list
             * and clears it again.
             */
486         if (!force_reg && device->pending_sync_bios.head) {
487                 pending_bios = &device->pending_sync_bios;
488                 force_reg = 1;
489         } else {
490                 pending_bios = &device->pending_bios;
491                 force_reg = 0;
492         }
493
494         pending = pending_bios->head;
495         tail = pending_bios->tail;
496         WARN_ON(pending && !tail);
497
498         /*
499          * if pending was null this time around, no bios need processing
500          * at all and we can stop.  Otherwise it'll loop back up again
501          * and do an additional check so no bios are missed.
502          *
503          * device->running_pending is used to synchronize with the
504          * schedule_bio code.
505          */
506         if (device->pending_sync_bios.head == NULL &&
507             device->pending_bios.head == NULL) {
508                 again = 0;
509                 device->running_pending = 0;
510         } else {
511                 again = 1;
512                 device->running_pending = 1;
513         }
514
            /* the chosen list is now owned locally through pending/tail */
515         pending_bios->head = NULL;
516         pending_bios->tail = NULL;
517
518         spin_unlock(&device->io_lock);
519
520         while (pending) {
521
522                 rmb();
523                 /* we want to work on both lists, but do more bios on the
524                  * sync list than the regular list
525                  */
526                 if ((num_run > 32 &&
527                     pending_bios != &device->pending_sync_bios &&
528                     device->pending_sync_bios.head) ||
529                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
530                     device->pending_bios.head)) {
                            /*
                             * put the unsubmitted remainder back and start
                             * a new pass; the lock stays held across the
                             * jump to loop_lock
                             */
531                         spin_lock(&device->io_lock);
532                         requeue_list(pending_bios, pending, tail);
533                         goto loop_lock;
534                 }
535
                    /* detach the first bio from the local chain */
536                 cur = pending;
537                 pending = pending->bi_next;
538                 cur->bi_next = NULL;
539
540                 BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
541
542                 /*
543                  * if we're doing the sync list, record that our
544                  * plug has some sync requests on it
545                  *
546                  * If we're doing the regular list and there are
547                  * sync requests sitting around, unplug before
548                  * we add more
549                  */
550                 if (pending_bios == &device->pending_sync_bios) {
551                         sync_pending = 1;
552                 } else if (sync_pending) {
553                         blk_finish_plug(&plug);
554                         blk_start_plug(&plug);
555                         sync_pending = 0;
556                 }
557
558                 btrfsic_submit_bio(cur);
                    /* num_run is reset on every pass, batch_run never is */
559                 num_run++;
560                 batch_run++;
561
562                 cond_resched();
563
564                 /*
565                  * we made progress, there is more work to do and the bdi
566                  * is now congested.  Back off and let other work structs
567                  * run instead
568                  */
569                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
570                     fs_info->fs_devices->open_devices > 1) {
571                         struct io_context *ioc;
572
573                         ioc = current->io_context;
574
575                         /*
576                          * the main goal here is that we don't want to
577                          * block if we're going to be able to submit
578                          * more requests without blocking.
579                          *
580                          * This code does two great things, it pokes into
581                          * the elevator code from a filesystem _and_
582                          * it makes assumptions about how batching works.
583                          */
584                         if (ioc && ioc->nr_batch_requests > 0 &&
585                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
586                             (last_waited == 0 ||
587                              ioc->last_waited == last_waited)) {
588                                 /*
589                                  * we want to go through our batch of
590                                  * requests and stop.  So, we copy out
591                                  * the ioc->last_waited time and test
592                                  * against it before looping
593                                  */
594                                 last_waited = ioc->last_waited;
595                                 cond_resched();
596                                 continue;
597                         }
                            /*
                             * requeue the remainder, leave running_pending
                             * set and queue device->work on submit_workers
                             * again before leaving
                             */
598                         spin_lock(&device->io_lock);
599                         requeue_list(pending_bios, pending, tail);
600                         device->running_pending = 1;
601
602                         spin_unlock(&device->io_lock);
603                         btrfs_queue_work(fs_info->submit_workers,
604                                          &device->work);
605                         goto done;
606                 }
607         }
608
609         cond_resched();
610         if (again)
611                 goto loop;
612
            /*
             * recheck both lists under io_lock before returning; if either
             * is non-empty run another pass, keeping the lock held for
             * loop_lock
             */
613         spin_lock(&device->io_lock);
614         if (device->pending_bios.head || device->pending_sync_bios.head)
615                 goto loop_lock;
616         spin_unlock(&device->io_lock);
617
618 done:
619         blk_finish_plug(&plug);
620 }
621
622 static void pending_bios_fn(struct btrfs_work *work)
623 {
624         struct btrfs_device *device;
625
626         device = container_of(work, struct btrfs_device, work);
627         run_scheduled_bios(device);
628 }
629
630 /*
631  *  Search and remove all stale (devices which are not mounted) devices.
632  *  When both inputs are NULL, it will search and release all stale devices.
633  *  path:       Optional. When provided will it release all unmounted devices
634  *              matching this path only.
635  *  skip_dev:   Optional. Will skip this device when searching for the stale
636  *              devices.
637  */
638 static void btrfs_free_stale_devices(const char *path,
639                                      struct btrfs_device *skip_dev)
640 {
641         struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
642         struct btrfs_device *dev, *tmp_dev;
643
644         list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
645
646                 if (fs_devs->opened)
647                         continue;
648
649                 list_for_each_entry_safe(dev, tmp_dev,
650                                          &fs_devs->devices, dev_list) {
651                         int not_found = 0;
652
653                         if (skip_dev && skip_dev == dev)
654                                 continue;
655                         if (path && !dev->name)
656                                 continue;
657
658                         rcu_read_lock();
659                         if (path)
660                                 not_found = strcmp(rcu_str_deref(dev->name),
661                                                    path);
662                         rcu_read_unlock();
663                         if (not_found)
664                                 continue;
665
666                         /* delete the stale device */
667                         if (fs_devs->num_devices == 1) {
668                                 btrfs_sysfs_remove_fsid(fs_devs);
669                                 list_del(&fs_devs->fs_list);
670                                 free_fs_devices(fs_devs);
671                                 break;
672                         } else {
673                                 fs_devs->num_devices--;
674                                 list_del(&dev->dev_list);
675                                 free_device(dev);
676                         }
677                 }
678         }
679 }
680
681 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
682                         struct btrfs_device *device, fmode_t flags,
683                         void *holder)
684 {
685         struct request_queue *q;
686         struct block_device *bdev;
687         struct buffer_head *bh;
688         struct btrfs_super_block *disk_super;
689         u64 devid;
690         int ret;
691
692         if (device->bdev)
693                 return -EINVAL;
694         if (!device->name)
695                 return -EINVAL;
696
697         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
698                                     &bdev, &bh);
699         if (ret)
700                 return ret;
701
702         disk_super = (struct btrfs_super_block *)bh->b_data;
703         devid = btrfs_stack_device_id(&disk_super->dev_item);
704         if (devid != device->devid)
705                 goto error_brelse;
706
707         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
708                 goto error_brelse;
709
710         device->generation = btrfs_super_generation(disk_super);
711
712         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
713                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
714                 fs_devices->seeding = 1;
715         } else {
716                 if (bdev_read_only(bdev))
717                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
718                 else
719                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
720         }
721
722         q = bdev_get_queue(bdev);
723         if (!blk_queue_nonrot(q))
724                 fs_devices->rotating = 1;
725
726         device->bdev = bdev;
727         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
728         device->mode = flags;
729
730         fs_devices->open_devices++;
731         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
732             device->devid != BTRFS_DEV_REPLACE_DEVID) {
733                 fs_devices->rw_devices++;
734                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
735         }
736         brelse(bh);
737
738         return 0;
739
740 error_brelse:
741         brelse(bh);
742         blkdev_put(bdev, flags);
743
744         return -EINVAL;
745 }
746
747 /*
748  * Add new device to list of registered devices
749  *
750  * Returns:
751  * device pointer which was just added or updated when successful
752  * error pointer when failed
753  */
754 static noinline struct btrfs_device *device_list_add(const char *path,
755                            struct btrfs_super_block *disk_super)
756 {
757         struct btrfs_device *device;
758         struct btrfs_fs_devices *fs_devices;
759         struct rcu_string *name;
760         u64 found_transid = btrfs_super_generation(disk_super);
761         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
762
763         fs_devices = find_fsid(disk_super->fsid);
764         if (!fs_devices) {
765                 fs_devices = alloc_fs_devices(disk_super->fsid);
766                 if (IS_ERR(fs_devices))
767                         return ERR_CAST(fs_devices);
768
769                 list_add(&fs_devices->fs_list, &fs_uuids);
770
771                 device = NULL;
772         } else {
773                 device = find_device(fs_devices, devid,
774                                 disk_super->dev_item.uuid);
775         }
776
777         if (!device) {
778                 if (fs_devices->opened)
779                         return ERR_PTR(-EBUSY);
780
781                 device = btrfs_alloc_device(NULL, &devid,
782                                             disk_super->dev_item.uuid);
783                 if (IS_ERR(device)) {
784                         /* we can safely leave the fs_devices entry around */
785                         return device;
786                 }
787
788                 name = rcu_string_strdup(path, GFP_NOFS);
789                 if (!name) {
790                         free_device(device);
791                         return ERR_PTR(-ENOMEM);
792                 }
793                 rcu_assign_pointer(device->name, name);
794
795                 mutex_lock(&fs_devices->device_list_mutex);
796                 list_add_rcu(&device->dev_list, &fs_devices->devices);
797                 fs_devices->num_devices++;
798                 mutex_unlock(&fs_devices->device_list_mutex);
799
800                 device->fs_devices = fs_devices;
801                 btrfs_free_stale_devices(path, device);
802
803                 if (disk_super->label[0])
804                         pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
805                                 disk_super->label, devid, found_transid, path);
806                 else
807                         pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
808                                 disk_super->fsid, devid, found_transid, path);
809
810         } else if (!device->name || strcmp(device->name->str, path)) {
811                 /*
812                  * When FS is already mounted.
813                  * 1. If you are here and if the device->name is NULL that
814                  *    means this device was missing at time of FS mount.
815                  * 2. If you are here and if the device->name is different
816                  *    from 'path' that means either
817                  *      a. The same device disappeared and reappeared with
818                  *         different name. or
819                  *      b. The missing-disk-which-was-replaced, has
820                  *         reappeared now.
821                  *
822                  * We must allow 1 and 2a above. But 2b would be a spurious
823                  * and unintentional.
824                  *
825                  * Further in case of 1 and 2a above, the disk at 'path'
826                  * would have missed some transaction when it was away and
827                  * in case of 2a the stale bdev has to be updated as well.
828                  * 2b must not be allowed at all time.
829                  */
830
831                 /*
832                  * For now, we do allow update to btrfs_fs_device through the
833                  * btrfs dev scan cli after FS has been mounted.  We're still
834                  * tracking a problem where systems fail mount by subvolume id
835                  * when we reject replacement on a mounted FS.
836                  */
837                 if (!fs_devices->opened && found_transid < device->generation) {
838                         /*
839                          * That is if the FS is _not_ mounted and if you
840                          * are here, that means there is more than one
841                          * disk with same uuid and devid.We keep the one
842                          * with larger generation number or the last-in if
843                          * generation are equal.
844                          */
845                         return ERR_PTR(-EEXIST);
846                 }
847
848                 name = rcu_string_strdup(path, GFP_NOFS);
849                 if (!name)
850                         return ERR_PTR(-ENOMEM);
851                 rcu_string_free(device->name);
852                 rcu_assign_pointer(device->name, name);
853                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
854                         fs_devices->missing_devices--;
855                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
856                 }
857         }
858
859         /*
860          * Unmount does not free the btrfs_device struct but would zero
861          * generation along with most of the other members. So just update
862          * it back. We need it to pick the disk with largest generation
863          * (as above).
864          */
865         if (!fs_devices->opened)
866                 device->generation = found_transid;
867
868         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
869
870         return device;
871 }
872
873 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
874 {
875         struct btrfs_fs_devices *fs_devices;
876         struct btrfs_device *device;
877         struct btrfs_device *orig_dev;
878
879         fs_devices = alloc_fs_devices(orig->fsid);
880         if (IS_ERR(fs_devices))
881                 return fs_devices;
882
883         mutex_lock(&orig->device_list_mutex);
884         fs_devices->total_devices = orig->total_devices;
885
886         /* We have held the volume lock, it is safe to get the devices. */
887         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
888                 struct rcu_string *name;
889
890                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
891                                             orig_dev->uuid);
892                 if (IS_ERR(device))
893                         goto error;
894
895                 /*
896                  * This is ok to do without rcu read locked because we hold the
897                  * uuid mutex so nothing we touch in here is going to disappear.
898                  */
899                 if (orig_dev->name) {
900                         name = rcu_string_strdup(orig_dev->name->str,
901                                         GFP_KERNEL);
902                         if (!name) {
903                                 free_device(device);
904                                 goto error;
905                         }
906                         rcu_assign_pointer(device->name, name);
907                 }
908
909                 list_add(&device->dev_list, &fs_devices->devices);
910                 device->fs_devices = fs_devices;
911                 fs_devices->num_devices++;
912         }
913         mutex_unlock(&orig->device_list_mutex);
914         return fs_devices;
915 error:
916         mutex_unlock(&orig->device_list_mutex);
917         free_fs_devices(fs_devices);
918         return ERR_PTR(-ENOMEM);
919 }
920
921 /*
922  * After we have read the system tree and know devids belonging to
923  * this filesystem, remove the device which does not belong there.
924  */
925 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
926 {
927         struct btrfs_device *device, *next;
928         struct btrfs_device *latest_dev = NULL;
929
930         mutex_lock(&uuid_mutex);
931 again:
932         /* This is the initialized path, it is safe to release the devices. */
933         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
934                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
935                                                         &device->dev_state)) {
936                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
937                              &device->dev_state) &&
938                              (!latest_dev ||
939                               device->generation > latest_dev->generation)) {
940                                 latest_dev = device;
941                         }
942                         continue;
943                 }
944
945                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
946                         /*
947                          * In the first step, keep the device which has
948                          * the correct fsid and the devid that is used
949                          * for the dev_replace procedure.
950                          * In the second step, the dev_replace state is
951                          * read from the device tree and it is known
952                          * whether the procedure is really active or
953                          * not, which means whether this device is
954                          * used or whether it should be removed.
955                          */
956                         if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
957                                                   &device->dev_state)) {
958                                 continue;
959                         }
960                 }
961                 if (device->bdev) {
962                         blkdev_put(device->bdev, device->mode);
963                         device->bdev = NULL;
964                         fs_devices->open_devices--;
965                 }
966                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
967                         list_del_init(&device->dev_alloc_list);
968                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
969                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
970                                       &device->dev_state))
971                                 fs_devices->rw_devices--;
972                 }
973                 list_del_init(&device->dev_list);
974                 fs_devices->num_devices--;
975                 free_device(device);
976         }
977
978         if (fs_devices->seed) {
979                 fs_devices = fs_devices->seed;
980                 goto again;
981         }
982
983         fs_devices->latest_bdev = latest_dev->bdev;
984
985         mutex_unlock(&uuid_mutex);
986 }
987
988 static void free_device_rcu(struct rcu_head *head)
989 {
990         struct btrfs_device *device;
991
992         device = container_of(head, struct btrfs_device, rcu);
993         free_device(device);
994 }
995
996 static void btrfs_close_bdev(struct btrfs_device *device)
997 {
998         if (!device->bdev)
999                 return;
1000
1001         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1002                 sync_blockdev(device->bdev);
1003                 invalidate_bdev(device->bdev);
1004         }
1005
1006         blkdev_put(device->bdev, device->mode);
1007 }
1008
1009 static void btrfs_prepare_close_one_device(struct btrfs_device *device)
1010 {
1011         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1012         struct btrfs_device *new_device;
1013         struct rcu_string *name;
1014
1015         if (device->bdev)
1016                 fs_devices->open_devices--;
1017
1018         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1019             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1020                 list_del_init(&device->dev_alloc_list);
1021                 fs_devices->rw_devices--;
1022         }
1023
1024         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1025                 fs_devices->missing_devices--;
1026
1027         new_device = btrfs_alloc_device(NULL, &device->devid,
1028                                         device->uuid);
1029         BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
1030
1031         /* Safe because we are under uuid_mutex */
1032         if (device->name) {
1033                 name = rcu_string_strdup(device->name->str, GFP_NOFS);
1034                 BUG_ON(!name); /* -ENOMEM */
1035                 rcu_assign_pointer(new_device->name, name);
1036         }
1037
1038         list_replace_rcu(&device->dev_list, &new_device->dev_list);
1039         new_device->fs_devices = device->fs_devices;
1040 }
1041
1042 static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1043 {
1044         struct btrfs_device *device, *tmp;
1045         struct list_head pending_put;
1046
1047         INIT_LIST_HEAD(&pending_put);
1048
1049         if (--fs_devices->opened > 0)
1050                 return 0;
1051
1052         mutex_lock(&fs_devices->device_list_mutex);
1053         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1054                 btrfs_prepare_close_one_device(device);
1055                 list_add(&device->dev_list, &pending_put);
1056         }
1057         mutex_unlock(&fs_devices->device_list_mutex);
1058
1059         /*
1060          * btrfs_show_devname() is using the device_list_mutex,
1061          * sometimes call to blkdev_put() leads vfs calling
1062          * into this func. So do put outside of device_list_mutex,
1063          * as of now.
1064          */
1065         while (!list_empty(&pending_put)) {
1066                 device = list_first_entry(&pending_put,
1067                                 struct btrfs_device, dev_list);
1068                 list_del(&device->dev_list);
1069                 btrfs_close_bdev(device);
1070                 call_rcu(&device->rcu, free_device_rcu);
1071         }
1072
1073         WARN_ON(fs_devices->open_devices);
1074         WARN_ON(fs_devices->rw_devices);
1075         fs_devices->opened = 0;
1076         fs_devices->seeding = 0;
1077
1078         return 0;
1079 }
1080
1081 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1082 {
1083         struct btrfs_fs_devices *seed_devices = NULL;
1084         int ret;
1085
1086         mutex_lock(&uuid_mutex);
1087         ret = close_fs_devices(fs_devices);
1088         if (!fs_devices->opened) {
1089                 seed_devices = fs_devices->seed;
1090                 fs_devices->seed = NULL;
1091         }
1092         mutex_unlock(&uuid_mutex);
1093
1094         while (seed_devices) {
1095                 fs_devices = seed_devices;
1096                 seed_devices = fs_devices->seed;
1097                 close_fs_devices(fs_devices);
1098                 free_fs_devices(fs_devices);
1099         }
1100         return ret;
1101 }
1102
1103 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1104                                 fmode_t flags, void *holder)
1105 {
1106         struct btrfs_device *device;
1107         struct btrfs_device *latest_dev = NULL;
1108         int ret = 0;
1109
1110         flags |= FMODE_EXCL;
1111
1112         list_for_each_entry(device, &fs_devices->devices, dev_list) {
1113                 /* Just open everything we can; ignore failures here */
1114                 if (btrfs_open_one_device(fs_devices, device, flags, holder))
1115                         continue;
1116
1117                 if (!latest_dev ||
1118                     device->generation > latest_dev->generation)
1119                         latest_dev = device;
1120         }
1121         if (fs_devices->open_devices == 0) {
1122                 ret = -EINVAL;
1123                 goto out;
1124         }
1125         fs_devices->opened = 1;
1126         fs_devices->latest_bdev = latest_dev->bdev;
1127         fs_devices->total_rw_bytes = 0;
1128 out:
1129         return ret;
1130 }
1131
1132 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1133 {
1134         struct btrfs_device *dev1, *dev2;
1135
1136         dev1 = list_entry(a, struct btrfs_device, dev_list);
1137         dev2 = list_entry(b, struct btrfs_device, dev_list);
1138
1139         if (dev1->devid < dev2->devid)
1140                 return -1;
1141         else if (dev1->devid > dev2->devid)
1142                 return 1;
1143         return 0;
1144 }
1145
1146 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1147                        fmode_t flags, void *holder)
1148 {
1149         int ret;
1150
1151         mutex_lock(&uuid_mutex);
1152         if (fs_devices->opened) {
1153                 fs_devices->opened++;
1154                 ret = 0;
1155         } else {
1156                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1157                 ret = open_fs_devices(fs_devices, flags, holder);
1158         }
1159         mutex_unlock(&uuid_mutex);
1160         return ret;
1161 }
1162
/*
 * Undo a successful btrfs_read_disk_super(): unmap @page and drop the page
 * reference, in that order.
 */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1168
1169 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1170                                  struct page **page,
1171                                  struct btrfs_super_block **disk_super)
1172 {
1173         void *p;
1174         pgoff_t index;
1175
1176         /* make sure our super fits in the device */
1177         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1178                 return 1;
1179
1180         /* make sure our super fits in the page */
1181         if (sizeof(**disk_super) > PAGE_SIZE)
1182                 return 1;
1183
1184         /* make sure our super doesn't straddle pages on disk */
1185         index = bytenr >> PAGE_SHIFT;
1186         if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1187                 return 1;
1188
1189         /* pull in the page with our super */
1190         *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1191                                    index, GFP_KERNEL);
1192
1193         if (IS_ERR_OR_NULL(*page))
1194                 return 1;
1195
1196         p = kmap(*page);
1197
1198         /* align our pointer to the offset of the super block */
1199         *disk_super = p + (bytenr & ~PAGE_MASK);
1200
1201         if (btrfs_super_bytenr(*disk_super) != bytenr ||
1202             btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1203                 btrfs_release_disk_super(*page);
1204                 return 1;
1205         }
1206
1207         if ((*disk_super)->label[0] &&
1208                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1209                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1210
1211         return 0;
1212 }
1213
1214 /*
1215  * Look for a btrfs signature on a device. This may be called out of the mount path
1216  * and we are not allowed to call set_blocksize during the scan. The superblock
1217  * is read via pagecache
1218  */
1219 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1220                           struct btrfs_fs_devices **fs_devices_ret)
1221 {
1222         struct btrfs_super_block *disk_super;
1223         struct btrfs_device *device;
1224         struct block_device *bdev;
1225         struct page *page;
1226         int ret = 0;
1227         u64 bytenr;
1228
1229         /*
1230          * we would like to check all the supers, but that would make
1231          * a btrfs mount succeed after a mkfs from a different FS.
1232          * So, we need to add a special mount option to scan for
1233          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1234          */
1235         bytenr = btrfs_sb_offset(0);
1236         flags |= FMODE_EXCL;
1237         mutex_lock(&uuid_mutex);
1238
1239         bdev = blkdev_get_by_path(path, flags, holder);
1240         if (IS_ERR(bdev)) {
1241                 ret = PTR_ERR(bdev);
1242                 goto error;
1243         }
1244
1245         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1246                 ret = -EINVAL;
1247                 goto error_bdev_put;
1248         }
1249
1250         device = device_list_add(path, disk_super);
1251         if (IS_ERR(device))
1252                 ret = PTR_ERR(device);
1253         else
1254                 *fs_devices_ret = device->fs_devices;
1255
1256         btrfs_release_disk_super(page);
1257
1258 error_bdev_put:
1259         blkdev_put(bdev, flags);
1260 error:
1261         mutex_unlock(&uuid_mutex);
1262         return ret;
1263 }
1264
1265 /* helper to account the used device space in the range */
1266 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1267                                    u64 end, u64 *length)
1268 {
1269         struct btrfs_key key;
1270         struct btrfs_root *root = device->fs_info->dev_root;
1271         struct btrfs_dev_extent *dev_extent;
1272         struct btrfs_path *path;
1273         u64 extent_end;
1274         int ret;
1275         int slot;
1276         struct extent_buffer *l;
1277
1278         *length = 0;
1279
1280         if (start >= device->total_bytes ||
1281                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
1282                 return 0;
1283
1284         path = btrfs_alloc_path();
1285         if (!path)
1286                 return -ENOMEM;
1287         path->reada = READA_FORWARD;
1288
1289         key.objectid = device->devid;
1290         key.offset = start;
1291         key.type = BTRFS_DEV_EXTENT_KEY;
1292
1293         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1294         if (ret < 0)
1295                 goto out;
1296         if (ret > 0) {
1297                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1298                 if (ret < 0)
1299                         goto out;
1300         }
1301
1302         while (1) {
1303                 l = path->nodes[0];
1304                 slot = path->slots[0];
1305                 if (slot >= btrfs_header_nritems(l)) {
1306                         ret = btrfs_next_leaf(root, path);
1307                         if (ret == 0)
1308                                 continue;
1309                         if (ret < 0)
1310                                 goto out;
1311
1312                         break;
1313                 }
1314                 btrfs_item_key_to_cpu(l, &key, slot);
1315
1316                 if (key.objectid < device->devid)
1317                         goto next;
1318
1319                 if (key.objectid > device->devid)
1320                         break;
1321
1322                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1323                         goto next;
1324
1325                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1326                 extent_end = key.offset + btrfs_dev_extent_length(l,
1327                                                                   dev_extent);
1328                 if (key.offset <= start && extent_end > end) {
1329                         *length = end - start + 1;
1330                         break;
1331                 } else if (key.offset <= start && extent_end > start)
1332                         *length += extent_end - start;
1333                 else if (key.offset > start && extent_end <= end)
1334                         *length += extent_end - key.offset;
1335                 else if (key.offset > start && key.offset <= end) {
1336                         *length += end - key.offset + 1;
1337                         break;
1338                 } else if (key.offset > end)
1339                         break;
1340
1341 next:
1342                 path->slots[0]++;
1343         }
1344         ret = 0;
1345 out:
1346         btrfs_free_path(path);
1347         return ret;
1348 }
1349
1350 static int contains_pending_extent(struct btrfs_transaction *transaction,
1351                                    struct btrfs_device *device,
1352                                    u64 *start, u64 len)
1353 {
1354         struct btrfs_fs_info *fs_info = device->fs_info;
1355         struct extent_map *em;
1356         struct list_head *search_list = &fs_info->pinned_chunks;
1357         int ret = 0;
1358         u64 physical_start = *start;
1359
1360         if (transaction)
1361                 search_list = &transaction->pending_chunks;
1362 again:
1363         list_for_each_entry(em, search_list, list) {
1364                 struct map_lookup *map;
1365                 int i;
1366
1367                 map = em->map_lookup;
1368                 for (i = 0; i < map->num_stripes; i++) {
1369                         u64 end;
1370
1371                         if (map->stripes[i].dev != device)
1372                                 continue;
1373                         if (map->stripes[i].physical >= physical_start + len ||
1374                             map->stripes[i].physical + em->orig_block_len <=
1375                             physical_start)
1376                                 continue;
1377                         /*
1378                          * Make sure that while processing the pinned list we do
1379                          * not override our *start with a lower value, because
1380                          * we can have pinned chunks that fall within this
1381                          * device hole and that have lower physical addresses
1382                          * than the pending chunks we processed before. If we
1383                          * do not take this special care we can end up getting
1384                          * 2 pending chunks that start at the same physical
1385                          * device offsets because the end offset of a pinned
1386                          * chunk can be equal to the start offset of some
1387                          * pending chunk.
1388                          */
1389                         end = map->stripes[i].physical + em->orig_block_len;
1390                         if (end > *start) {
1391                                 *start = end;
1392                                 ret = 1;
1393                         }
1394                 }
1395         }
1396         if (search_list != &fs_info->pinned_chunks) {
1397                 search_list = &fs_info->pinned_chunks;
1398                 goto again;
1399         }
1400
1401         return ret;
1402 }
1403
1404
1405 /*
1406  * find_free_dev_extent_start - find free space in the specified device
1407  * @device:       the device which we search the free space in
1408  * @num_bytes:    the size of the free space that we need
1409  * @search_start: the position from which to begin the search
1410  * @start:        store the start of the free space.
1411  * @len:          the size of the free space. that we find, or the size
1412  *                of the max free space if we don't find suitable free space
1413  *
1414  * this uses a pretty simple search, the expectation is that it is
1415  * called very infrequently and that a given device has a small number
1416  * of extents
1417  *
1418  * @start is used to store the start of the free space if we find. But if we
1419  * don't find suitable free space, it will be used to store the start position
1420  * of the max free space.
1421  *
1422  * @len is used to store the size of the free space that we find.
1423  * But if we don't find suitable free space, it is used to store the size of
1424  * the max free space.
1425  */
1426 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1427                                struct btrfs_device *device, u64 num_bytes,
1428                                u64 search_start, u64 *start, u64 *len)
1429 {
1430         struct btrfs_fs_info *fs_info = device->fs_info;
1431         struct btrfs_root *root = fs_info->dev_root;
1432         struct btrfs_key key;
1433         struct btrfs_dev_extent *dev_extent;
1434         struct btrfs_path *path;
1435         u64 hole_size;
1436         u64 max_hole_start;
1437         u64 max_hole_size;
1438         u64 extent_end;
1439         u64 search_end = device->total_bytes;
1440         int ret;
1441         int slot;
1442         struct extent_buffer *l;
1443
1444         /*
1445          * We don't want to overwrite the superblock on the drive nor any area
1446          * used by the boot loader (grub for example), so we make sure to start
1447          * at an offset of at least 1MB.
1448          */
1449         search_start = max_t(u64, search_start, SZ_1M);
1450
1451         path = btrfs_alloc_path();
1452         if (!path)
1453                 return -ENOMEM;
1454
1455         max_hole_start = search_start;
1456         max_hole_size = 0;
1457
1458 again:
1459         if (search_start >= search_end ||
1460                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1461                 ret = -ENOSPC;
1462                 goto out;
1463         }
1464
1465         path->reada = READA_FORWARD;
1466         path->search_commit_root = 1;
1467         path->skip_locking = 1;
1468
1469         key.objectid = device->devid;
1470         key.offset = search_start;
1471         key.type = BTRFS_DEV_EXTENT_KEY;
1472
1473         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1474         if (ret < 0)
1475                 goto out;
1476         if (ret > 0) {
1477                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1478                 if (ret < 0)
1479                         goto out;
1480         }
1481
1482         while (1) {
1483                 l = path->nodes[0];
1484                 slot = path->slots[0];
1485                 if (slot >= btrfs_header_nritems(l)) {
1486                         ret = btrfs_next_leaf(root, path);
1487                         if (ret == 0)
1488                                 continue;
1489                         if (ret < 0)
1490                                 goto out;
1491
1492                         break;
1493                 }
1494                 btrfs_item_key_to_cpu(l, &key, slot);
1495
1496                 if (key.objectid < device->devid)
1497                         goto next;
1498
1499                 if (key.objectid > device->devid)
1500                         break;
1501
1502                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1503                         goto next;
1504
1505                 if (key.offset > search_start) {
1506                         hole_size = key.offset - search_start;
1507
1508                         /*
1509                          * Have to check before we set max_hole_start, otherwise
1510                          * we could end up sending back this offset anyway.
1511                          */
1512                         if (contains_pending_extent(transaction, device,
1513                                                     &search_start,
1514                                                     hole_size)) {
1515                                 if (key.offset >= search_start) {
1516                                         hole_size = key.offset - search_start;
1517                                 } else {
1518                                         WARN_ON_ONCE(1);
1519                                         hole_size = 0;
1520                                 }
1521                         }
1522
1523                         if (hole_size > max_hole_size) {
1524                                 max_hole_start = search_start;
1525                                 max_hole_size = hole_size;
1526                         }
1527
1528                         /*
1529                          * If this free space is greater than which we need,
1530                          * it must be the max free space that we have found
1531                          * until now, so max_hole_start must point to the start
1532                          * of this free space and the length of this free space
1533                          * is stored in max_hole_size. Thus, we return
1534                          * max_hole_start and max_hole_size and go back to the
1535                          * caller.
1536                          */
1537                         if (hole_size >= num_bytes) {
1538                                 ret = 0;
1539                                 goto out;
1540                         }
1541                 }
1542
1543                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1544                 extent_end = key.offset + btrfs_dev_extent_length(l,
1545                                                                   dev_extent);
1546                 if (extent_end > search_start)
1547                         search_start = extent_end;
1548 next:
1549                 path->slots[0]++;
1550                 cond_resched();
1551         }
1552
1553         /*
1554          * At this point, search_start should be the end of
1555          * allocated dev extents, and when shrinking the device,
1556          * search_end may be smaller than search_start.
1557          */
1558         if (search_end > search_start) {
1559                 hole_size = search_end - search_start;
1560
1561                 if (contains_pending_extent(transaction, device, &search_start,
1562                                             hole_size)) {
1563                         btrfs_release_path(path);
1564                         goto again;
1565                 }
1566
1567                 if (hole_size > max_hole_size) {
1568                         max_hole_start = search_start;
1569                         max_hole_size = hole_size;
1570                 }
1571         }
1572
1573         /* See above. */
1574         if (max_hole_size < num_bytes)
1575                 ret = -ENOSPC;
1576         else
1577                 ret = 0;
1578
1579 out:
1580         btrfs_free_path(path);
1581         *start = max_hole_start;
1582         if (len)
1583                 *len = max_hole_size;
1584         return ret;
1585 }
1586
1587 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1588                          struct btrfs_device *device, u64 num_bytes,
1589                          u64 *start, u64 *len)
1590 {
1591         /* FIXME use last free of some kind */
1592         return find_free_dev_extent_start(trans->transaction, device,
1593                                           num_bytes, 0, start, len);
1594 }
1595
1596 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1597                           struct btrfs_device *device,
1598                           u64 start, u64 *dev_extent_len)
1599 {
1600         struct btrfs_fs_info *fs_info = device->fs_info;
1601         struct btrfs_root *root = fs_info->dev_root;
1602         int ret;
1603         struct btrfs_path *path;
1604         struct btrfs_key key;
1605         struct btrfs_key found_key;
1606         struct extent_buffer *leaf = NULL;
1607         struct btrfs_dev_extent *extent = NULL;
1608
1609         path = btrfs_alloc_path();
1610         if (!path)
1611                 return -ENOMEM;
1612
1613         key.objectid = device->devid;
1614         key.offset = start;
1615         key.type = BTRFS_DEV_EXTENT_KEY;
1616 again:
1617         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1618         if (ret > 0) {
1619                 ret = btrfs_previous_item(root, path, key.objectid,
1620                                           BTRFS_DEV_EXTENT_KEY);
1621                 if (ret)
1622                         goto out;
1623                 leaf = path->nodes[0];
1624                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1625                 extent = btrfs_item_ptr(leaf, path->slots[0],
1626                                         struct btrfs_dev_extent);
1627                 BUG_ON(found_key.offset > start || found_key.offset +
1628                        btrfs_dev_extent_length(leaf, extent) < start);
1629                 key = found_key;
1630                 btrfs_release_path(path);
1631                 goto again;
1632         } else if (ret == 0) {
1633                 leaf = path->nodes[0];
1634                 extent = btrfs_item_ptr(leaf, path->slots[0],
1635                                         struct btrfs_dev_extent);
1636         } else {
1637                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1638                 goto out;
1639         }
1640
1641         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1642
1643         ret = btrfs_del_item(trans, root, path);
1644         if (ret) {
1645                 btrfs_handle_fs_error(fs_info, ret,
1646                                       "Failed to remove dev extent item");
1647         } else {
1648                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1649         }
1650 out:
1651         btrfs_free_path(path);
1652         return ret;
1653 }
1654
1655 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1656                                   struct btrfs_device *device,
1657                                   u64 chunk_offset, u64 start, u64 num_bytes)
1658 {
1659         int ret;
1660         struct btrfs_path *path;
1661         struct btrfs_fs_info *fs_info = device->fs_info;
1662         struct btrfs_root *root = fs_info->dev_root;
1663         struct btrfs_dev_extent *extent;
1664         struct extent_buffer *leaf;
1665         struct btrfs_key key;
1666
1667         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1668         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1669         path = btrfs_alloc_path();
1670         if (!path)
1671                 return -ENOMEM;
1672
1673         key.objectid = device->devid;
1674         key.offset = start;
1675         key.type = BTRFS_DEV_EXTENT_KEY;
1676         ret = btrfs_insert_empty_item(trans, root, path, &key,
1677                                       sizeof(*extent));
1678         if (ret)
1679                 goto out;
1680
1681         leaf = path->nodes[0];
1682         extent = btrfs_item_ptr(leaf, path->slots[0],
1683                                 struct btrfs_dev_extent);
1684         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1685                                         BTRFS_CHUNK_TREE_OBJECTID);
1686         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1687                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1688         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1689
1690         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1691         btrfs_mark_buffer_dirty(leaf);
1692 out:
1693         btrfs_free_path(path);
1694         return ret;
1695 }
1696
1697 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1698 {
1699         struct extent_map_tree *em_tree;
1700         struct extent_map *em;
1701         struct rb_node *n;
1702         u64 ret = 0;
1703
1704         em_tree = &fs_info->mapping_tree.map_tree;
1705         read_lock(&em_tree->lock);
1706         n = rb_last(&em_tree->map);
1707         if (n) {
1708                 em = rb_entry(n, struct extent_map, rb_node);
1709                 ret = em->start + em->len;
1710         }
1711         read_unlock(&em_tree->lock);
1712
1713         return ret;
1714 }
1715
1716 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1717                                     u64 *devid_ret)
1718 {
1719         int ret;
1720         struct btrfs_key key;
1721         struct btrfs_key found_key;
1722         struct btrfs_path *path;
1723
1724         path = btrfs_alloc_path();
1725         if (!path)
1726                 return -ENOMEM;
1727
1728         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1729         key.type = BTRFS_DEV_ITEM_KEY;
1730         key.offset = (u64)-1;
1731
1732         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1733         if (ret < 0)
1734                 goto error;
1735
1736         BUG_ON(ret == 0); /* Corruption */
1737
1738         ret = btrfs_previous_item(fs_info->chunk_root, path,
1739                                   BTRFS_DEV_ITEMS_OBJECTID,
1740                                   BTRFS_DEV_ITEM_KEY);
1741         if (ret) {
1742                 *devid_ret = 1;
1743         } else {
1744                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1745                                       path->slots[0]);
1746                 *devid_ret = found_key.offset + 1;
1747         }
1748         ret = 0;
1749 error:
1750         btrfs_free_path(path);
1751         return ret;
1752 }
1753
1754 /*
1755  * the device information is stored in the chunk root
1756  * the btrfs_device struct should be fully filled in
1757  */
1758 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1759                             struct btrfs_fs_info *fs_info,
1760                             struct btrfs_device *device)
1761 {
1762         struct btrfs_root *root = fs_info->chunk_root;
1763         int ret;
1764         struct btrfs_path *path;
1765         struct btrfs_dev_item *dev_item;
1766         struct extent_buffer *leaf;
1767         struct btrfs_key key;
1768         unsigned long ptr;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = device->devid;
1777
1778         ret = btrfs_insert_empty_item(trans, root, path, &key,
1779                                       sizeof(*dev_item));
1780         if (ret)
1781                 goto out;
1782
1783         leaf = path->nodes[0];
1784         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1785
1786         btrfs_set_device_id(leaf, dev_item, device->devid);
1787         btrfs_set_device_generation(leaf, dev_item, 0);
1788         btrfs_set_device_type(leaf, dev_item, device->type);
1789         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1790         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1791         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1792         btrfs_set_device_total_bytes(leaf, dev_item,
1793                                      btrfs_device_get_disk_total_bytes(device));
1794         btrfs_set_device_bytes_used(leaf, dev_item,
1795                                     btrfs_device_get_bytes_used(device));
1796         btrfs_set_device_group(leaf, dev_item, 0);
1797         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1798         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1799         btrfs_set_device_start_offset(leaf, dev_item, 0);
1800
1801         ptr = btrfs_device_uuid(dev_item);
1802         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1803         ptr = btrfs_device_fsid(dev_item);
1804         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1805         btrfs_mark_buffer_dirty(leaf);
1806
1807         ret = 0;
1808 out:
1809         btrfs_free_path(path);
1810         return ret;
1811 }
1812
1813 /*
1814  * Function to update ctime/mtime for a given device path.
1815  * Mainly used for ctime/mtime based probe like libblkid.
1816  */
1817 static void update_dev_time(const char *path_name)
1818 {
1819         struct file *filp;
1820
1821         filp = filp_open(path_name, O_RDWR, 0);
1822         if (IS_ERR(filp))
1823                 return;
1824         file_update_time(filp);
1825         filp_close(filp, NULL);
1826 }
1827
1828 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1829                              struct btrfs_device *device)
1830 {
1831         struct btrfs_root *root = fs_info->chunk_root;
1832         int ret;
1833         struct btrfs_path *path;
1834         struct btrfs_key key;
1835         struct btrfs_trans_handle *trans;
1836
1837         path = btrfs_alloc_path();
1838         if (!path)
1839                 return -ENOMEM;
1840
1841         trans = btrfs_start_transaction(root, 0);
1842         if (IS_ERR(trans)) {
1843                 btrfs_free_path(path);
1844                 return PTR_ERR(trans);
1845         }
1846         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1847         key.type = BTRFS_DEV_ITEM_KEY;
1848         key.offset = device->devid;
1849
1850         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1851         if (ret) {
1852                 if (ret > 0)
1853                         ret = -ENOENT;
1854                 btrfs_abort_transaction(trans, ret);
1855                 btrfs_end_transaction(trans);
1856                 goto out;
1857         }
1858
1859         ret = btrfs_del_item(trans, root, path);
1860         if (ret) {
1861                 btrfs_abort_transaction(trans, ret);
1862                 btrfs_end_transaction(trans);
1863         }
1864
1865 out:
1866         btrfs_free_path(path);
1867         if (!ret)
1868                 ret = btrfs_commit_transaction(trans);
1869         return ret;
1870 }
1871
1872 /*
1873  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1874  * filesystem. It's up to the caller to adjust that number regarding eg. device
1875  * replace.
1876  */
1877 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1878                 u64 num_devices)
1879 {
1880         u64 all_avail;
1881         unsigned seq;
1882         int i;
1883
1884         do {
1885                 seq = read_seqbegin(&fs_info->profiles_lock);
1886
1887                 all_avail = fs_info->avail_data_alloc_bits |
1888                             fs_info->avail_system_alloc_bits |
1889                             fs_info->avail_metadata_alloc_bits;
1890         } while (read_seqretry(&fs_info->profiles_lock, seq));
1891
1892         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1893                 if (!(all_avail & btrfs_raid_group[i]))
1894                         continue;
1895
1896                 if (num_devices < btrfs_raid_array[i].devs_min) {
1897                         int ret = btrfs_raid_mindev_error[i];
1898
1899                         if (ret)
1900                                 return ret;
1901                 }
1902         }
1903
1904         return 0;
1905 }
1906
1907 static struct btrfs_device * btrfs_find_next_active_device(
1908                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1909 {
1910         struct btrfs_device *next_device;
1911
1912         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1913                 if (next_device != device &&
1914                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1915                     && next_device->bdev)
1916                         return next_device;
1917         }
1918
1919         return NULL;
1920 }
1921
1922 /*
1923  * Helper function to check if the given device is part of s_bdev / latest_bdev
1924  * and replace it with the provided or the next active device, in the context
1925  * where this function called, there should be always be another device (or
1926  * this_dev) which is active.
1927  */
1928 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1929                 struct btrfs_device *device, struct btrfs_device *this_dev)
1930 {
1931         struct btrfs_device *next_device;
1932
1933         if (this_dev)
1934                 next_device = this_dev;
1935         else
1936                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1937                                                                 device);
1938         ASSERT(next_device);
1939
1940         if (fs_info->sb->s_bdev &&
1941                         (fs_info->sb->s_bdev == device->bdev))
1942                 fs_info->sb->s_bdev = next_device->bdev;
1943
1944         if (fs_info->fs_devices->latest_bdev == device->bdev)
1945                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1946 }
1947
1948 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1949                 u64 devid)
1950 {
1951         struct btrfs_device *device;
1952         struct btrfs_fs_devices *cur_devices;
1953         u64 num_devices;
1954         int ret = 0;
1955
1956         mutex_lock(&fs_info->volume_mutex);
1957         mutex_lock(&uuid_mutex);
1958
1959         num_devices = fs_info->fs_devices->num_devices;
1960         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1961         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1962                 WARN_ON(num_devices < 1);
1963                 num_devices--;
1964         }
1965         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1966
1967         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1968         if (ret)
1969                 goto out;
1970
1971         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1972                                            &device);
1973         if (ret)
1974                 goto out;
1975
1976         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1977                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1978                 goto out;
1979         }
1980
1981         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1982             fs_info->fs_devices->rw_devices == 1) {
1983                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1984                 goto out;
1985         }
1986
1987         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1988                 mutex_lock(&fs_info->chunk_mutex);
1989                 list_del_init(&device->dev_alloc_list);
1990                 device->fs_devices->rw_devices--;
1991                 mutex_unlock(&fs_info->chunk_mutex);
1992         }
1993
1994         mutex_unlock(&uuid_mutex);
1995         ret = btrfs_shrink_device(device, 0);
1996         mutex_lock(&uuid_mutex);
1997         if (ret)
1998                 goto error_undo;
1999
2000         /*
2001          * TODO: the superblock still includes this device in its num_devices
2002          * counter although write_all_supers() is not locked out. This
2003          * could give a filesystem state which requires a degraded mount.
2004          */
2005         ret = btrfs_rm_dev_item(fs_info, device);
2006         if (ret)
2007                 goto error_undo;
2008
2009         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2010         btrfs_scrub_cancel_dev(fs_info, device);
2011
2012         /*
2013          * the device list mutex makes sure that we don't change
2014          * the device list while someone else is writing out all
2015          * the device supers. Whoever is writing all supers, should
2016          * lock the device list mutex before getting the number of
2017          * devices in the super block (super_copy). Conversely,
2018          * whoever updates the number of devices in the super block
2019          * (super_copy) should hold the device list mutex.
2020          */
2021
2022         cur_devices = device->fs_devices;
2023         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2024         list_del_rcu(&device->dev_list);
2025
2026         device->fs_devices->num_devices--;
2027         device->fs_devices->total_devices--;
2028
2029         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2030                 device->fs_devices->missing_devices--;
2031
2032         btrfs_assign_next_active_device(fs_info, device, NULL);
2033
2034         if (device->bdev) {
2035                 device->fs_devices->open_devices--;
2036                 /* remove sysfs entry */
2037                 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2038         }
2039
2040         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2041         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2042         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2043
2044         /*
2045          * at this point, the device is zero sized and detached from
2046          * the devices list.  All that's left is to zero out the old
2047          * supers and free the device.
2048          */
2049         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2050                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2051
2052         btrfs_close_bdev(device);
2053         call_rcu(&device->rcu, free_device_rcu);
2054
2055         if (cur_devices->open_devices == 0) {
2056                 struct btrfs_fs_devices *fs_devices;
2057                 fs_devices = fs_info->fs_devices;
2058                 while (fs_devices) {
2059                         if (fs_devices->seed == cur_devices) {
2060                                 fs_devices->seed = cur_devices->seed;
2061                                 break;
2062                         }
2063                         fs_devices = fs_devices->seed;
2064                 }
2065                 cur_devices->seed = NULL;
2066                 close_fs_devices(cur_devices);
2067                 free_fs_devices(cur_devices);
2068         }
2069
2070 out:
2071         mutex_unlock(&uuid_mutex);
2072         mutex_unlock(&fs_info->volume_mutex);
2073         return ret;
2074
2075 error_undo:
2076         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2077                 mutex_lock(&fs_info->chunk_mutex);
2078                 list_add(&device->dev_alloc_list,
2079                          &fs_info->fs_devices->alloc_list);
2080                 device->fs_devices->rw_devices++;
2081                 mutex_unlock(&fs_info->chunk_mutex);
2082         }
2083         goto out;
2084 }
2085
2086 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2087                                         struct btrfs_device *srcdev)
2088 {
2089         struct btrfs_fs_devices *fs_devices;
2090
2091         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2092
2093         /*
2094          * in case of fs with no seed, srcdev->fs_devices will point
2095          * to fs_devices of fs_info. However when the dev being replaced is
2096          * a seed dev it will point to the seed's local fs_devices. In short
2097          * srcdev will have its correct fs_devices in both the cases.
2098          */
2099         fs_devices = srcdev->fs_devices;
2100
2101         list_del_rcu(&srcdev->dev_list);
2102         list_del(&srcdev->dev_alloc_list);
2103         fs_devices->num_devices--;
2104         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2105                 fs_devices->missing_devices--;
2106
2107         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2108                 fs_devices->rw_devices--;
2109
2110         if (srcdev->bdev)
2111                 fs_devices->open_devices--;
2112 }
2113
2114 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2115                                       struct btrfs_device *srcdev)
2116 {
2117         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2118
2119         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2120                 /* zero out the old super if it is writable */
2121                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2122         }
2123
2124         btrfs_close_bdev(srcdev);
2125         call_rcu(&srcdev->rcu, free_device_rcu);
2126
2127         /* if this is no devs we rather delete the fs_devices */
2128         if (!fs_devices->num_devices) {
2129                 struct btrfs_fs_devices *tmp_fs_devices;
2130
2131                 /*
2132                  * On a mounted FS, num_devices can't be zero unless it's a
2133                  * seed. In case of a seed device being replaced, the replace
2134                  * target added to the sprout FS, so there will be no more
2135                  * device left under the seed FS.
2136                  */
2137                 ASSERT(fs_devices->seeding);
2138
2139                 tmp_fs_devices = fs_info->fs_devices;
2140                 while (tmp_fs_devices) {
2141                         if (tmp_fs_devices->seed == fs_devices) {
2142                                 tmp_fs_devices->seed = fs_devices->seed;
2143                                 break;
2144                         }
2145                         tmp_fs_devices = tmp_fs_devices->seed;
2146                 }
2147                 fs_devices->seed = NULL;
2148                 close_fs_devices(fs_devices);
2149                 free_fs_devices(fs_devices);
2150         }
2151 }
2152
2153 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2154                                       struct btrfs_device *tgtdev)
2155 {
2156         mutex_lock(&uuid_mutex);
2157         WARN_ON(!tgtdev);
2158         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2159
2160         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2161
2162         if (tgtdev->bdev)
2163                 fs_info->fs_devices->open_devices--;
2164
2165         fs_info->fs_devices->num_devices--;
2166
2167         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2168
2169         list_del_rcu(&tgtdev->dev_list);
2170
2171         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2172         mutex_unlock(&uuid_mutex);
2173
2174         /*
2175          * The update_dev_time() with in btrfs_scratch_superblocks()
2176          * may lead to a call to btrfs_show_devname() which will try
2177          * to hold device_list_mutex. And here this device
2178          * is already out of device list, so we don't have to hold
2179          * the device_list_mutex lock.
2180          */
2181         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2182
2183         btrfs_close_bdev(tgtdev);
2184         call_rcu(&tgtdev->rcu, free_device_rcu);
2185 }
2186
2187 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2188                                      const char *device_path,
2189                                      struct btrfs_device **device)
2190 {
2191         int ret = 0;
2192         struct btrfs_super_block *disk_super;
2193         u64 devid;
2194         u8 *dev_uuid;
2195         struct block_device *bdev;
2196         struct buffer_head *bh;
2197
2198         *device = NULL;
2199         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2200                                     fs_info->bdev_holder, 0, &bdev, &bh);
2201         if (ret)
2202                 return ret;
2203         disk_super = (struct btrfs_super_block *)bh->b_data;
2204         devid = btrfs_stack_device_id(&disk_super->dev_item);
2205         dev_uuid = disk_super->dev_item.uuid;
2206         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2207         brelse(bh);
2208         if (!*device)
2209                 ret = -ENOENT;
2210         blkdev_put(bdev, FMODE_READ);
2211         return ret;
2212 }
2213
2214 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2215                                          const char *device_path,
2216                                          struct btrfs_device **device)
2217 {
2218         *device = NULL;
2219         if (strcmp(device_path, "missing") == 0) {
2220                 struct list_head *devices;
2221                 struct btrfs_device *tmp;
2222
2223                 devices = &fs_info->fs_devices->devices;
2224                 /*
2225                  * It is safe to read the devices since the volume_mutex
2226                  * is held by the caller.
2227                  */
2228                 list_for_each_entry(tmp, devices, dev_list) {
2229                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2230                                         &tmp->dev_state) && !tmp->bdev) {
2231                                 *device = tmp;
2232                                 break;
2233                         }
2234                 }
2235
2236                 if (!*device)
2237                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2238
2239                 return 0;
2240         } else {
2241                 return btrfs_find_device_by_path(fs_info, device_path, device);
2242         }
2243 }
2244
2245 /*
2246  * Lookup a device given by device id, or the path if the id is 0.
2247  */
2248 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2249                                  const char *devpath,
2250                                  struct btrfs_device **device)
2251 {
2252         int ret;
2253
2254         if (devid) {
2255                 ret = 0;
2256                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2257                 if (!*device)
2258                         ret = -ENOENT;
2259         } else {
2260                 if (!devpath || !devpath[0])
2261                         return -EINVAL;
2262
2263                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2264                                                            device);
2265         }
2266         return ret;
2267 }
2268
2269 /*
2270  * does all the dirty work required for changing file system's UUID.
2271  */
2272 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2273 {
2274         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2275         struct btrfs_fs_devices *old_devices;
2276         struct btrfs_fs_devices *seed_devices;
2277         struct btrfs_super_block *disk_super = fs_info->super_copy;
2278         struct btrfs_device *device;
2279         u64 super_flags;
2280
2281         lockdep_assert_held(&uuid_mutex);
2282         if (!fs_devices->seeding)
2283                 return -EINVAL;
2284
2285         seed_devices = alloc_fs_devices(NULL);
2286         if (IS_ERR(seed_devices))
2287                 return PTR_ERR(seed_devices);
2288
2289         old_devices = clone_fs_devices(fs_devices);
2290         if (IS_ERR(old_devices)) {
2291                 kfree(seed_devices);
2292                 return PTR_ERR(old_devices);
2293         }
2294
2295         list_add(&old_devices->fs_list, &fs_uuids);
2296
2297         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2298         seed_devices->opened = 1;
2299         INIT_LIST_HEAD(&seed_devices->devices);
2300         INIT_LIST_HEAD(&seed_devices->alloc_list);
2301         mutex_init(&seed_devices->device_list_mutex);
2302
2303         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2304         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2305                               synchronize_rcu);
2306         list_for_each_entry(device, &seed_devices->devices, dev_list)
2307                 device->fs_devices = seed_devices;
2308
2309         mutex_lock(&fs_info->chunk_mutex);
2310         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2311         mutex_unlock(&fs_info->chunk_mutex);
2312
2313         fs_devices->seeding = 0;
2314         fs_devices->num_devices = 0;
2315         fs_devices->open_devices = 0;
2316         fs_devices->missing_devices = 0;
2317         fs_devices->rotating = 0;
2318         fs_devices->seed = seed_devices;
2319
2320         generate_random_uuid(fs_devices->fsid);
2321         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2322         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2323         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2324
2325         super_flags = btrfs_super_flags(disk_super) &
2326                       ~BTRFS_SUPER_FLAG_SEEDING;
2327         btrfs_set_super_flags(disk_super, super_flags);
2328
2329         return 0;
2330 }
2331
2332 /*
2333  * Store the expected generation for seed devices in device items.
2334  */
2335 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2336                                struct btrfs_fs_info *fs_info)
2337 {
2338         struct btrfs_root *root = fs_info->chunk_root;
2339         struct btrfs_path *path;
2340         struct extent_buffer *leaf;
2341         struct btrfs_dev_item *dev_item;
2342         struct btrfs_device *device;
2343         struct btrfs_key key;
2344         u8 fs_uuid[BTRFS_FSID_SIZE];
2345         u8 dev_uuid[BTRFS_UUID_SIZE];
2346         u64 devid;
2347         int ret;
2348
2349         path = btrfs_alloc_path();
2350         if (!path)
2351                 return -ENOMEM;
2352
2353         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2354         key.offset = 0;
2355         key.type = BTRFS_DEV_ITEM_KEY;
2356
2357         while (1) {
2358                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2359                 if (ret < 0)
2360                         goto error;
2361
2362                 leaf = path->nodes[0];
2363 next_slot:
2364                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2365                         ret = btrfs_next_leaf(root, path);
2366                         if (ret > 0)
2367                                 break;
2368                         if (ret < 0)
2369                                 goto error;
2370                         leaf = path->nodes[0];
2371                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2372                         btrfs_release_path(path);
2373                         continue;
2374                 }
2375
2376                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2377                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2378                     key.type != BTRFS_DEV_ITEM_KEY)
2379                         break;
2380
2381                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2382                                           struct btrfs_dev_item);
2383                 devid = btrfs_device_id(leaf, dev_item);
2384                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2385                                    BTRFS_UUID_SIZE);
2386                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2387                                    BTRFS_FSID_SIZE);
2388                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2389                 BUG_ON(!device); /* Logic error */
2390
2391                 if (device->fs_devices->seeding) {
2392                         btrfs_set_device_generation(leaf, dev_item,
2393                                                     device->generation);
2394                         btrfs_mark_buffer_dirty(leaf);
2395                 }
2396
2397                 path->slots[0]++;
2398                 goto next_slot;
2399         }
2400         ret = 0;
2401 error:
2402         btrfs_free_path(path);
2403         return ret;
2404 }
2405
2406 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2407 {
2408         struct btrfs_root *root = fs_info->dev_root;
2409         struct request_queue *q;
2410         struct btrfs_trans_handle *trans;
2411         struct btrfs_device *device;
2412         struct block_device *bdev;
2413         struct list_head *devices;
2414         struct super_block *sb = fs_info->sb;
2415         struct rcu_string *name;
2416         u64 tmp;
2417         int seeding_dev = 0;
2418         int ret = 0;
2419         bool unlocked = false;
2420
2421         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2422                 return -EROFS;
2423
2424         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2425                                   fs_info->bdev_holder);
2426         if (IS_ERR(bdev))
2427                 return PTR_ERR(bdev);
2428
2429         if (fs_info->fs_devices->seeding) {
2430                 seeding_dev = 1;
2431                 down_write(&sb->s_umount);
2432                 mutex_lock(&uuid_mutex);
2433         }
2434
2435         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2436
2437         devices = &fs_info->fs_devices->devices;
2438
2439         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2440         list_for_each_entry(device, devices, dev_list) {
2441                 if (device->bdev == bdev) {
2442                         ret = -EEXIST;
2443                         mutex_unlock(
2444                                 &fs_info->fs_devices->device_list_mutex);
2445                         goto error;
2446                 }
2447         }
2448         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2449
2450         device = btrfs_alloc_device(fs_info, NULL, NULL);
2451         if (IS_ERR(device)) {
2452                 /* we can safely leave the fs_devices entry around */
2453                 ret = PTR_ERR(device);
2454                 goto error;
2455         }
2456
2457         name = rcu_string_strdup(device_path, GFP_KERNEL);
2458         if (!name) {
2459                 ret = -ENOMEM;
2460                 goto error_free_device;
2461         }
2462         rcu_assign_pointer(device->name, name);
2463
2464         trans = btrfs_start_transaction(root, 0);
2465         if (IS_ERR(trans)) {
2466                 ret = PTR_ERR(trans);
2467                 goto error_free_device;
2468         }
2469
2470         q = bdev_get_queue(bdev);
2471         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2472         device->generation = trans->transid;
2473         device->io_width = fs_info->sectorsize;
2474         device->io_align = fs_info->sectorsize;
2475         device->sector_size = fs_info->sectorsize;
2476         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2477                                          fs_info->sectorsize);
2478         device->disk_total_bytes = device->total_bytes;
2479         device->commit_total_bytes = device->total_bytes;
2480         device->fs_info = fs_info;
2481         device->bdev = bdev;
2482         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2483         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2484         device->mode = FMODE_EXCL;
2485         device->dev_stats_valid = 1;
2486         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2487
2488         if (seeding_dev) {
2489                 sb->s_flags &= ~SB_RDONLY;
2490                 ret = btrfs_prepare_sprout(fs_info);
2491                 if (ret) {
2492                         btrfs_abort_transaction(trans, ret);
2493                         goto error_trans;
2494                 }
2495         }
2496
2497         device->fs_devices = fs_info->fs_devices;
2498
2499         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2500         mutex_lock(&fs_info->chunk_mutex);
2501         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2502         list_add(&device->dev_alloc_list,
2503                  &fs_info->fs_devices->alloc_list);
2504         fs_info->fs_devices->num_devices++;
2505         fs_info->fs_devices->open_devices++;
2506         fs_info->fs_devices->rw_devices++;
2507         fs_info->fs_devices->total_devices++;
2508         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2509
2510         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2511
2512         if (!blk_queue_nonrot(q))
2513                 fs_info->fs_devices->rotating = 1;
2514
2515         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2516         btrfs_set_super_total_bytes(fs_info->super_copy,
2517                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2518
2519         tmp = btrfs_super_num_devices(fs_info->super_copy);
2520         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2521
2522         /* add sysfs device entry */
2523         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2524
2525         /*
2526          * we've got more storage, clear any full flags on the space
2527          * infos
2528          */
2529         btrfs_clear_space_info_full(fs_info);
2530
2531         mutex_unlock(&fs_info->chunk_mutex);
2532         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2533
2534         if (seeding_dev) {
2535                 mutex_lock(&fs_info->chunk_mutex);
2536                 ret = init_first_rw_device(trans, fs_info);
2537                 mutex_unlock(&fs_info->chunk_mutex);
2538                 if (ret) {
2539                         btrfs_abort_transaction(trans, ret);
2540                         goto error_sysfs;
2541                 }
2542         }
2543
2544         ret = btrfs_add_dev_item(trans, fs_info, device);
2545         if (ret) {
2546                 btrfs_abort_transaction(trans, ret);
2547                 goto error_sysfs;
2548         }
2549
2550         if (seeding_dev) {
2551                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2552
2553                 ret = btrfs_finish_sprout(trans, fs_info);
2554                 if (ret) {
2555                         btrfs_abort_transaction(trans, ret);
2556                         goto error_sysfs;
2557                 }
2558
2559                 /* Sprouting would change fsid of the mounted root,
2560                  * so rename the fsid on the sysfs
2561                  */
2562                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2563                                                 fs_info->fsid);
2564                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2565                         btrfs_warn(fs_info,
2566                                    "sysfs: failed to create fsid for sprout");
2567         }
2568
2569         ret = btrfs_commit_transaction(trans);
2570
2571         if (seeding_dev) {
2572                 mutex_unlock(&uuid_mutex);
2573                 up_write(&sb->s_umount);
2574                 unlocked = true;
2575
2576                 if (ret) /* transaction commit */
2577                         return ret;
2578
2579                 ret = btrfs_relocate_sys_chunks(fs_info);
2580                 if (ret < 0)
2581                         btrfs_handle_fs_error(fs_info, ret,
2582                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2583                 trans = btrfs_attach_transaction(root);
2584                 if (IS_ERR(trans)) {
2585                         if (PTR_ERR(trans) == -ENOENT)
2586                                 return 0;
2587                         ret = PTR_ERR(trans);
2588                         trans = NULL;
2589                         goto error_sysfs;
2590                 }
2591                 ret = btrfs_commit_transaction(trans);
2592         }
2593
2594         /* Update ctime/mtime for libblkid */
2595         update_dev_time(device_path);
2596         return ret;
2597
2598 error_sysfs:
2599         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2600 error_trans:
2601         if (seeding_dev)
2602                 sb->s_flags |= SB_RDONLY;
2603         if (trans)
2604                 btrfs_end_transaction(trans);
2605 error_free_device:
2606         free_device(device);
2607 error:
2608         blkdev_put(bdev, FMODE_EXCL);
2609         if (seeding_dev && !unlocked) {
2610                 mutex_unlock(&uuid_mutex);
2611                 up_write(&sb->s_umount);
2612         }
2613         return ret;
2614 }
2615
2616 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2617                                   const char *device_path,
2618                                   struct btrfs_device *srcdev,
2619                                   struct btrfs_device **device_out)
2620 {
2621         struct btrfs_device *device;
2622         struct block_device *bdev;
2623         struct list_head *devices;
2624         struct rcu_string *name;
2625         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2626         int ret = 0;
2627
2628         *device_out = NULL;
2629         if (fs_info->fs_devices->seeding) {
2630                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2631                 return -EINVAL;
2632         }
2633
2634         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2635                                   fs_info->bdev_holder);
2636         if (IS_ERR(bdev)) {
2637                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2638                 return PTR_ERR(bdev);
2639         }
2640
2641         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2642
2643         devices = &fs_info->fs_devices->devices;
2644         list_for_each_entry(device, devices, dev_list) {
2645                 if (device->bdev == bdev) {
2646                         btrfs_err(fs_info,
2647                                   "target device is in the filesystem!");
2648                         ret = -EEXIST;
2649                         goto error;
2650                 }
2651         }
2652
2653
2654         if (i_size_read(bdev->bd_inode) <
2655             btrfs_device_get_total_bytes(srcdev)) {
2656                 btrfs_err(fs_info,
2657                           "target device is smaller than source device!");
2658                 ret = -EINVAL;
2659                 goto error;
2660         }
2661
2662
2663         device = btrfs_alloc_device(NULL, &devid, NULL);
2664         if (IS_ERR(device)) {
2665                 ret = PTR_ERR(device);
2666                 goto error;
2667         }
2668
2669         name = rcu_string_strdup(device_path, GFP_KERNEL);
2670         if (!name) {
2671                 free_device(device);
2672                 ret = -ENOMEM;
2673                 goto error;
2674         }
2675         rcu_assign_pointer(device->name, name);
2676
2677         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2678         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2679         device->generation = 0;
2680         device->io_width = fs_info->sectorsize;
2681         device->io_align = fs_info->sectorsize;
2682         device->sector_size = fs_info->sectorsize;
2683         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2684         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2685         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2686         device->commit_total_bytes = srcdev->commit_total_bytes;
2687         device->commit_bytes_used = device->bytes_used;
2688         device->fs_info = fs_info;
2689         device->bdev = bdev;
2690         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2691         set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2692         device->mode = FMODE_EXCL;
2693         device->dev_stats_valid = 1;
2694         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2695         device->fs_devices = fs_info->fs_devices;
2696         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2697         fs_info->fs_devices->num_devices++;
2698         fs_info->fs_devices->open_devices++;
2699         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2700
2701         *device_out = device;
2702         return ret;
2703
2704 error:
2705         blkdev_put(bdev, FMODE_EXCL);
2706         return ret;
2707 }
2708
2709 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2710                                         struct btrfs_device *device)
2711 {
2712         int ret;
2713         struct btrfs_path *path;
2714         struct btrfs_root *root = device->fs_info->chunk_root;
2715         struct btrfs_dev_item *dev_item;
2716         struct extent_buffer *leaf;
2717         struct btrfs_key key;
2718
2719         path = btrfs_alloc_path();
2720         if (!path)
2721                 return -ENOMEM;
2722
2723         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2724         key.type = BTRFS_DEV_ITEM_KEY;
2725         key.offset = device->devid;
2726
2727         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2728         if (ret < 0)
2729                 goto out;
2730
2731         if (ret > 0) {
2732                 ret = -ENOENT;
2733                 goto out;
2734         }
2735
2736         leaf = path->nodes[0];
2737         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2738
2739         btrfs_set_device_id(leaf, dev_item, device->devid);
2740         btrfs_set_device_type(leaf, dev_item, device->type);
2741         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2742         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2743         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2744         btrfs_set_device_total_bytes(leaf, dev_item,
2745                                      btrfs_device_get_disk_total_bytes(device));
2746         btrfs_set_device_bytes_used(leaf, dev_item,
2747                                     btrfs_device_get_bytes_used(device));
2748         btrfs_mark_buffer_dirty(leaf);
2749
2750 out:
2751         btrfs_free_path(path);
2752         return ret;
2753 }
2754
2755 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2756                       struct btrfs_device *device, u64 new_size)
2757 {
2758         struct btrfs_fs_info *fs_info = device->fs_info;
2759         struct btrfs_super_block *super_copy = fs_info->super_copy;
2760         struct btrfs_fs_devices *fs_devices;
2761         u64 old_total;
2762         u64 diff;
2763
2764         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2765                 return -EACCES;
2766
2767         new_size = round_down(new_size, fs_info->sectorsize);
2768
2769         mutex_lock(&fs_info->chunk_mutex);
2770         old_total = btrfs_super_total_bytes(super_copy);
2771         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2772
2773         if (new_size <= device->total_bytes ||
2774             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2775                 mutex_unlock(&fs_info->chunk_mutex);
2776                 return -EINVAL;
2777         }
2778
2779         fs_devices = fs_info->fs_devices;
2780
2781         btrfs_set_super_total_bytes(super_copy,
2782                         round_down(old_total + diff, fs_info->sectorsize));
2783         device->fs_devices->total_rw_bytes += diff;
2784
2785         btrfs_device_set_total_bytes(device, new_size);
2786         btrfs_device_set_disk_total_bytes(device, new_size);
2787         btrfs_clear_space_info_full(device->fs_info);
2788         if (list_empty(&device->resized_list))
2789                 list_add_tail(&device->resized_list,
2790                               &fs_devices->resized_devices);
2791         mutex_unlock(&fs_info->chunk_mutex);
2792
2793         return btrfs_update_device(trans, device);
2794 }
2795
2796 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2797                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2798 {
2799         struct btrfs_root *root = fs_info->chunk_root;
2800         int ret;
2801         struct btrfs_path *path;
2802         struct btrfs_key key;
2803
2804         path = btrfs_alloc_path();
2805         if (!path)
2806                 return -ENOMEM;
2807
2808         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2809         key.offset = chunk_offset;
2810         key.type = BTRFS_CHUNK_ITEM_KEY;
2811
2812         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2813         if (ret < 0)
2814                 goto out;
2815         else if (ret > 0) { /* Logic error or corruption */
2816                 btrfs_handle_fs_error(fs_info, -ENOENT,
2817                                       "Failed lookup while freeing chunk.");
2818                 ret = -ENOENT;
2819                 goto out;
2820         }
2821
2822         ret = btrfs_del_item(trans, root, path);
2823         if (ret < 0)
2824                 btrfs_handle_fs_error(fs_info, ret,
2825                                       "Failed to delete chunk item.");
2826 out:
2827         btrfs_free_path(path);
2828         return ret;
2829 }
2830
2831 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2832 {
2833         struct btrfs_super_block *super_copy = fs_info->super_copy;
2834         struct btrfs_disk_key *disk_key;
2835         struct btrfs_chunk *chunk;
2836         u8 *ptr;
2837         int ret = 0;
2838         u32 num_stripes;
2839         u32 array_size;
2840         u32 len = 0;
2841         u32 cur;
2842         struct btrfs_key key;
2843
2844         mutex_lock(&fs_info->chunk_mutex);
2845         array_size = btrfs_super_sys_array_size(super_copy);
2846
2847         ptr = super_copy->sys_chunk_array;
2848         cur = 0;
2849
2850         while (cur < array_size) {
2851                 disk_key = (struct btrfs_disk_key *)ptr;
2852                 btrfs_disk_key_to_cpu(&key, disk_key);
2853
2854                 len = sizeof(*disk_key);
2855
2856                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2857                         chunk = (struct btrfs_chunk *)(ptr + len);
2858                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2859                         len += btrfs_chunk_item_size(num_stripes);
2860                 } else {
2861                         ret = -EIO;
2862                         break;
2863                 }
2864                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2865                     key.offset == chunk_offset) {
2866                         memmove(ptr, ptr + len, array_size - (cur + len));
2867                         array_size -= len;
2868                         btrfs_set_super_sys_array_size(super_copy, array_size);
2869                 } else {
2870                         ptr += len;
2871                         cur += len;
2872                 }
2873         }
2874         mutex_unlock(&fs_info->chunk_mutex);
2875         return ret;
2876 }
2877
2878 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2879                                         u64 logical, u64 length)
2880 {
2881         struct extent_map_tree *em_tree;
2882         struct extent_map *em;
2883
2884         em_tree = &fs_info->mapping_tree.map_tree;
2885         read_lock(&em_tree->lock);
2886         em = lookup_extent_mapping(em_tree, logical, length);
2887         read_unlock(&em_tree->lock);
2888
2889         if (!em) {
2890                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2891                            logical, length);
2892                 return ERR_PTR(-EINVAL);
2893         }
2894
2895         if (em->start > logical || em->start + em->len < logical) {
2896                 btrfs_crit(fs_info,
2897                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2898                            logical, length, em->start, em->start + em->len);
2899                 free_extent_map(em);
2900                 return ERR_PTR(-EINVAL);
2901         }
2902
2903         /* callers are responsible for dropping em's ref. */
2904         return em;
2905 }
2906
2907 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2908                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2909 {
2910         struct extent_map *em;
2911         struct map_lookup *map;
2912         u64 dev_extent_len = 0;
2913         int i, ret = 0;
2914         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2915
2916         em = get_chunk_map(fs_info, chunk_offset, 1);
2917         if (IS_ERR(em)) {
2918                 /*
2919                  * This is a logic error, but we don't want to just rely on the
2920                  * user having built with ASSERT enabled, so if ASSERT doesn't
2921                  * do anything we still error out.
2922                  */
2923                 ASSERT(0);
2924                 return PTR_ERR(em);
2925         }
2926         map = em->map_lookup;
2927         mutex_lock(&fs_info->chunk_mutex);
2928         check_system_chunk(trans, fs_info, map->type);
2929         mutex_unlock(&fs_info->chunk_mutex);
2930
2931         /*
2932          * Take the device list mutex to prevent races with the final phase of
2933          * a device replace operation that replaces the device object associated
2934          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2935          */
2936         mutex_lock(&fs_devices->device_list_mutex);
2937         for (i = 0; i < map->num_stripes; i++) {
2938                 struct btrfs_device *device = map->stripes[i].dev;
2939                 ret = btrfs_free_dev_extent(trans, device,
2940                                             map->stripes[i].physical,
2941                                             &dev_extent_len);
2942                 if (ret) {
2943                         mutex_unlock(&fs_devices->device_list_mutex);
2944                         btrfs_abort_transaction(trans, ret);
2945                         goto out;
2946                 }
2947
2948                 if (device->bytes_used > 0) {
2949                         mutex_lock(&fs_info->chunk_mutex);
2950                         btrfs_device_set_bytes_used(device,
2951                                         device->bytes_used - dev_extent_len);
2952                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2953                         btrfs_clear_space_info_full(fs_info);
2954                         mutex_unlock(&fs_info->chunk_mutex);
2955                 }
2956
2957                 if (map->stripes[i].dev) {
2958                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2959                         if (ret) {
2960                                 mutex_unlock(&fs_devices->device_list_mutex);
2961                                 btrfs_abort_transaction(trans, ret);
2962                                 goto out;
2963                         }
2964                 }
2965         }
2966         mutex_unlock(&fs_devices->device_list_mutex);
2967
2968         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2969         if (ret) {
2970                 btrfs_abort_transaction(trans, ret);
2971                 goto out;
2972         }
2973
2974         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2975
2976         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2977                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2978                 if (ret) {
2979                         btrfs_abort_transaction(trans, ret);
2980                         goto out;
2981                 }
2982         }
2983
2984         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2985         if (ret) {
2986                 btrfs_abort_transaction(trans, ret);
2987                 goto out;
2988         }
2989
2990 out:
2991         /* once for us */
2992         free_extent_map(em);
2993         return ret;
2994 }
2995
2996 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2997 {
2998         struct btrfs_root *root = fs_info->chunk_root;
2999         struct btrfs_trans_handle *trans;
3000         int ret;
3001
3002         /*
3003          * Prevent races with automatic removal of unused block groups.
3004          * After we relocate and before we remove the chunk with offset
3005          * chunk_offset, automatic removal of the block group can kick in,
3006          * resulting in a failure when calling btrfs_remove_chunk() below.
3007          *
3008          * Make sure to acquire this mutex before doing a tree search (dev
3009          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3010          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3011          * we release the path used to search the chunk/dev tree and before
3012          * the current task acquires this mutex and calls us.
3013          */
3014         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3015
3016         ret = btrfs_can_relocate(fs_info, chunk_offset);
3017         if (ret)
3018                 return -ENOSPC;
3019
3020         /* step one, relocate all the extents inside this chunk */
3021         btrfs_scrub_pause(fs_info);
3022         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3023         btrfs_scrub_continue(fs_info);
3024         if (ret)
3025                 return ret;
3026
3027         /*
3028          * We add the kobjects here (and after forcing data chunk creation)
3029          * since relocation is the only place we'll create chunks of a new
3030          * type at runtime.  The only place where we'll remove the last
3031          * chunk of a type is the call immediately below this one.  Even
3032          * so, we're protected against races with the cleaner thread since
3033          * we're covered by the delete_unused_bgs_mutex.
3034          */
3035         btrfs_add_raid_kobjects(fs_info);
3036
3037         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3038                                                      chunk_offset);
3039         if (IS_ERR(trans)) {
3040                 ret = PTR_ERR(trans);
3041                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3042                 return ret;
3043         }
3044
3045         /*
3046          * step two, delete the device extents and the
3047          * chunk tree entries
3048          */
3049         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
3050         btrfs_end_transaction(trans);
3051         return ret;
3052 }
3053
3054 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3055 {
3056         struct btrfs_root *chunk_root = fs_info->chunk_root;
3057         struct btrfs_path *path;
3058         struct extent_buffer *leaf;
3059         struct btrfs_chunk *chunk;
3060         struct btrfs_key key;
3061         struct btrfs_key found_key;
3062         u64 chunk_type;
3063         bool retried = false;
3064         int failed = 0;
3065         int ret;
3066
3067         path = btrfs_alloc_path();
3068         if (!path)
3069                 return -ENOMEM;
3070
3071 again:
3072         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3073         key.offset = (u64)-1;
3074         key.type = BTRFS_CHUNK_ITEM_KEY;
3075
3076         while (1) {
3077                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3078                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3079                 if (ret < 0) {
3080                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3081                         goto error;
3082                 }
3083                 BUG_ON(ret == 0); /* Corruption */
3084
3085                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3086                                           key.type);
3087                 if (ret)
3088                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3089                 if (ret < 0)
3090                         goto error;
3091                 if (ret > 0)
3092                         break;
3093
3094                 leaf = path->nodes[0];
3095                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3096
3097                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3098                                        struct btrfs_chunk);
3099                 chunk_type = btrfs_chunk_type(leaf, chunk);
3100                 btrfs_release_path(path);
3101
3102                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3103                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3104                         if (ret == -ENOSPC)
3105                                 failed++;
3106                         else
3107                                 BUG_ON(ret);
3108                 }
3109                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3110
3111                 if (found_key.offset == 0)
3112                         break;
3113                 key.offset = found_key.offset - 1;
3114         }
3115         ret = 0;
3116         if (failed && !retried) {
3117                 failed = 0;
3118                 retried = true;
3119                 goto again;
3120         } else if (WARN_ON(failed && retried)) {
3121                 ret = -ENOSPC;
3122         }
3123 error:
3124         btrfs_free_path(path);
3125         return ret;
3126 }
3127
3128 /*
3129  * return 1 : allocate a data chunk successfully,
3130  * return <0: errors during allocating a data chunk,
3131  * return 0 : no need to allocate a data chunk.
3132  */
3133 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3134                                       u64 chunk_offset)
3135 {
3136         struct btrfs_block_group_cache *cache;
3137         u64 bytes_used;
3138         u64 chunk_type;
3139
3140         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3141         ASSERT(cache);
3142         chunk_type = cache->flags;
3143         btrfs_put_block_group(cache);
3144
3145         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3146                 spin_lock(&fs_info->data_sinfo->lock);
3147                 bytes_used = fs_info->data_sinfo->bytes_used;
3148                 spin_unlock(&fs_info->data_sinfo->lock);
3149
3150                 if (!bytes_used) {
3151                         struct btrfs_trans_handle *trans;
3152                         int ret;
3153
3154                         trans = btrfs_join_transaction(fs_info->tree_root);
3155                         if (IS_ERR(trans))
3156                                 return PTR_ERR(trans);
3157
3158                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3159                                                       BTRFS_BLOCK_GROUP_DATA);
3160                         btrfs_end_transaction(trans);
3161                         if (ret < 0)
3162                                 return ret;
3163
3164                         btrfs_add_raid_kobjects(fs_info);
3165
3166                         return 1;
3167                 }
3168         }
3169         return 0;
3170 }
3171
3172 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3173                                struct btrfs_balance_control *bctl)
3174 {
3175         struct btrfs_root *root = fs_info->tree_root;
3176         struct btrfs_trans_handle *trans;
3177         struct btrfs_balance_item *item;
3178         struct btrfs_disk_balance_args disk_bargs;
3179         struct btrfs_path *path;
3180         struct extent_buffer *leaf;
3181         struct btrfs_key key;
3182         int ret, err;
3183
3184         path = btrfs_alloc_path();
3185         if (!path)
3186                 return -ENOMEM;
3187
3188         trans = btrfs_start_transaction(root, 0);
3189         if (IS_ERR(trans)) {
3190                 btrfs_free_path(path);
3191                 return PTR_ERR(trans);
3192         }
3193
3194         key.objectid = BTRFS_BALANCE_OBJECTID;
3195         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3196         key.offset = 0;
3197
3198         ret = btrfs_insert_empty_item(trans, root, path, &key,
3199                                       sizeof(*item));
3200         if (ret)
3201                 goto out;
3202
3203         leaf = path->nodes[0];
3204         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3205
3206         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3207
3208         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3209         btrfs_set_balance_data(leaf, item, &disk_bargs);
3210         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3211         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3212         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3213         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3214
3215         btrfs_set_balance_flags(leaf, item, bctl->flags);
3216
3217         btrfs_mark_buffer_dirty(leaf);
3218 out:
3219         btrfs_free_path(path);
3220         err = btrfs_commit_transaction(trans);
3221         if (err && !ret)
3222                 ret = err;
3223         return ret;
3224 }
3225
3226 static int del_balance_item(struct btrfs_fs_info *fs_info)
3227 {
3228         struct btrfs_root *root = fs_info->tree_root;
3229         struct btrfs_trans_handle *trans;
3230         struct btrfs_path *path;
3231         struct btrfs_key key;
3232         int ret, err;
3233
3234         path = btrfs_alloc_path();
3235         if (!path)
3236                 return -ENOMEM;
3237
3238         trans = btrfs_start_transaction(root, 0);
3239         if (IS_ERR(trans)) {
3240                 btrfs_free_path(path);
3241                 return PTR_ERR(trans);
3242         }
3243
3244         key.objectid = BTRFS_BALANCE_OBJECTID;
3245         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3246         key.offset = 0;
3247
3248         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3249</