fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};

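/*
 * Return the human-readable name of the given RAID type, or NULL if @type
 * is out of range.
 */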
const char *get_raid_name(enum btrfs_raid_types type)
{
        if (type >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[type].raid_name;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of the Paused state, BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled
 * or completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
                                                 const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        if (metadata_fsid)
                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
        else if (fsid)
                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}

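/*
 * Free a btrfs_device that is no longer reachable from any list: drop the
 * rcu-protected name and the preallocated flush bio, then free the struct.
 */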
void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

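/*
 * Free an fs_devices structure together with all devices still linked to
 * it. Must only be called once the fs_devices is no longer opened.
 */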
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

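/* Send a uevent for the underlying disk's kobject, warning on failure. */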
static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

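/*
 * Find the fs_devices matching @fsid (and @metadata_fsid when not NULL) on
 * the global fs_uuids list, or return NULL.
 */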
static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devices;

        ASSERT(fsid);

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (metadata_fsid) {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
                                      BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                } else {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                }
        }
        return NULL;
}

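/*
 * Open the block device at @device_path, optionally flush dirty pages,
 * set the btrfs block size and read the super block into *bh. On failure
 * both *bdev and *bh are set to NULL and an error is returned.
 */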
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

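/*
 * Put a chain of bios (from @head to @tail) back at the front of the
 * pending list, preserving their order.
 */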
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

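/* Work item callback: drain this device's pending bio lists. */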
static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both arguments are NULL, it will search and release all stale devices.
 *
 * @path:        Optional. When provided, only unmounted devices matching
 *               this path are released.
 * @skip_device: Optional. This device is skipped when searching for stale
 *               devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;

        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
                mutex_lock(&fs_devices->device_list_mutex);
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        continue;
                }

                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        int not_found = 0;

                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(device->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        if (fs_devices->num_devices == 0)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);
                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }
}

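/*
 * Open a single device, verify that the devid and uuid recorded in its
 * super block match @device, and account it in @fs_devices. Returns 0 on
 * success or a negative error on any mismatch or open failure.
 */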
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                if (btrfs_super_incompat_flags(disk_super) &
                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
                        pr_err(
                "BTRFS: Invalid seeding and uuid-changed device detected\n");
                        goto error_brelse;
                }

                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add a new device to the list of registered devices
 *
 * Returns:
 * the device pointer that was just added or updated on success
 * an error pointer on failure
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

        if (has_metadata_uuid)
                fs_devices = find_fsid(disk_super->fsid, disk_super->metadata_uuid);
        else
                fs_devices = find_fsid(disk_super->fsid, NULL);

        if (!fs_devices) {
                if (has_metadata_uuid)
                        fs_devices = alloc_fs_devices(disk_super->fsid,
                                                      disk_super->metadata_uuid);
                else
                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

                /* Check for an allocation error before touching the struct */
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                fs_devices->fsid_change = fsid_change_in_progress;

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When FS is already mounted.
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with
                 *         different name. or
                 *      b. The missing-disk-which-was-replaced, has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transaction when it was away and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if generations are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EEXIST);
                }


                /*
                 * We are going to replace the device path for a given devid,
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
                        struct block_device *path_bdev;

                        path_bdev = lookup_bdev(path);
                        if (IS_ERR(path_bdev)) {
                                mutex_unlock(&fs_devices->device_list_mutex);
                                return ERR_CAST(path_bdev);
                        }

                        if (device->bdev != path_bdev) {
                                bdput(path_bdev);
                                mutex_unlock(&fs_devices->device_list_mutex);
                                btrfs_warn_in_rcu(device->fs_info,
                        "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
                                        disk_super->fsid, devid,
                                        rcu_str_deref(device->name), path);
                                return ERR_PTR(-EEXIST);
                        }
                        bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
                                "device fsid %pU devid %llu moved old:%s new:%s",
                                disk_super->fsid, devid,
                                rcu_str_deref(device->name), path);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with largest generation
         * (as above).
         */
        if (!fs_devices->opened) {
                device->generation = found_transid;
                fs_devices->latest_generation = max_t(u64, found_transid,
                                                fs_devices->latest_generation);
        }

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

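/*
 * Create an in-memory duplicate of @orig: a new fs_devices holding clones
 * of all member devices (names and devids copied, no open block devices).
 */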
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid, NULL);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold the device_list_mutex, it is safe to walk the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

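/* RCU callback that frees a device once the grace period has expired. */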
static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

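/*
 * Close the block device of @device, syncing and invalidating its page
 * cache first if the device was writeable.
 */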
static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

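/*
 * Close @device and replace it on the device list with a fresh stub that
 * keeps the devid, uuid and name, so the list stays consistent for RCU
 * readers while the old structure is freed after a grace period.
 */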
static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;

        call_rcu(&device->rcu, free_device_rcu);
}

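/*
 * Drop one open reference on @fs_devices and, when it reaches zero, close
 * all member devices.
 */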
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_close_one_device(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

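/*
 * Open all members of @fs_devices that can be opened and remember the
 * device with the highest generation as the latest one. Fails with
 * -EINVAL only when no device could be opened at all.
 */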
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

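/* list_sort() comparator that orders devices by ascending devid. */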
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        lockdep_assert_held(&uuid_mutex);

        mutex_lock(&fs_devices->device_list_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

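/*
 * Read the super block at offset @bytenr of @bdev through the page cache.
 * Returns 0 and sets *page and *disk_super on success, or 1 when there is
 * no valid btrfs super block at that offset.
 */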
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via the page cache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
                                           void *holder)
{
        struct btrfs_super_block *disk_super;
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
        struct page *page;
        u64 bytenr;

        lockdep_assert_held(&uuid_mutex);

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                device = ERR_PTR(-EINVAL);
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super, &new_device_added);
        if (!IS_ERR(device)) {
                if (new_device_added)
                        btrfs_free_stale_devices(path, device);
        }

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);

        return device;
}

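/*
 * Check whether the range [*start, *start + len) on @device overlaps any
 * pending or pinned chunk. If it does, advance *start past the end of the
 * conflicting stripe and return 1, otherwise return 0.
 */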
static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space.
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
1505                                 } else {
1506                                         WARN_ON_ONCE(1);
1507                                         hole_size = 0;
1508                                 }
1509                         }
1510
1511                         if (hole_size > max_hole_size) {
1512                                 max_hole_start = search_start;
1513                                 max_hole_size = hole_size;
1514                         }
1515
1516                         /*
1517                          * If this free space is greater than what we need,
1518                          * it must be the max free space that we have found
1519                          * so far, so max_hole_start must point to the start
1520                          * of this free space and the length of this free
1521                          * space is stored in max_hole_size. Thus, we return
1522                          * max_hole_start and max_hole_size and go back to
1523                          * the caller.
1524                          */
1525                         if (hole_size >= num_bytes) {
1526                                 ret = 0;
1527                                 goto out;
1528                         }
1529                 }
1530
1531                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1532                 extent_end = key.offset + btrfs_dev_extent_length(l,
1533                                                                   dev_extent);
1534                 if (extent_end > search_start)
1535                         search_start = extent_end;
1536 next:
1537                 path->slots[0]++;
1538                 cond_resched();
1539         }
1540
1541         /*
1542          * At this point, search_start should be the end of
1543          * allocated dev extents, and when shrinking the device,
1544          * search_end may be smaller than search_start.
1545          */
1546         if (search_end > search_start) {
1547                 hole_size = search_end - search_start;
1548
1549                 if (contains_pending_extent(transaction, device, &search_start,
1550                                             hole_size)) {
1551                         btrfs_release_path(path);
1552                         goto again;
1553                 }
1554
1555                 if (hole_size > max_hole_size) {
1556                         max_hole_start = search_start;
1557                         max_hole_size = hole_size;
1558                 }
1559         }
1560
1561         /* See above. */
1562         if (max_hole_size < num_bytes)
1563                 ret = -ENOSPC;
1564         else
1565                 ret = 0;
1566
1567 out:
1568         btrfs_free_path(path);
1569         *start = max_hole_start;
1570         if (len)
1571                 *len = max_hole_size;
1572         return ret;
1573 }
1574
1575 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1576                          struct btrfs_device *device, u64 num_bytes,
1577                          u64 *start, u64 *len)
1578 {
1579         /* FIXME use last free of some kind */
1580         return find_free_dev_extent_start(trans->transaction, device,
1581                                           num_bytes, 0, start, len);
1582 }
1583
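/*
 * Delete the dev extent item in the device tree that covers @start on
 * @device and return the extent's length in @dev_extent_len. On success
 * this also flags the transaction as having freed block groups.
 */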
1584 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1585                           struct btrfs_device *device,
1586                           u64 start, u64 *dev_extent_len)
1587 {
1588         struct btrfs_fs_info *fs_info = device->fs_info;
1589         struct btrfs_root *root = fs_info->dev_root;
1590         int ret;
1591         struct btrfs_path *path;
1592         struct btrfs_key key;
1593         struct btrfs_key found_key;
1594         struct extent_buffer *leaf = NULL;
1595         struct btrfs_dev_extent *extent = NULL;
1596
1597         path = btrfs_alloc_path();
1598         if (!path)
1599                 return -ENOMEM;
1600
1601         key.objectid = device->devid;
1602         key.offset = start;
1603         key.type = BTRFS_DEV_EXTENT_KEY;
1604 again:
1605         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1606         if (ret > 0) {
1607                 ret = btrfs_previous_item(root, path, key.objectid,
1608                                           BTRFS_DEV_EXTENT_KEY);
1609                 if (ret)
1610                         goto out;
1611                 leaf = path->nodes[0];
1612                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1613                 extent = btrfs_item_ptr(leaf, path->slots[0],
1614                                         struct btrfs_dev_extent);
1615                 BUG_ON(found_key.offset > start || found_key.offset +
1616                        btrfs_dev_extent_length(leaf, extent) < start);
1617                 key = found_key;
1618                 btrfs_release_path(path);
1619                 goto again;
1620         } else if (ret == 0) {
1621                 leaf = path->nodes[0];
1622                 extent = btrfs_item_ptr(leaf, path->slots[0],
1623                                         struct btrfs_dev_extent);
1624         } else {
1625                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1626                 goto out;
1627         }
1628
1629         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1630
1631         ret = btrfs_del_item(trans, root, path);
1632         if (ret) {
1633                 btrfs_handle_fs_error(fs_info, ret,
1634                                       "Failed to remove dev extent item");
1635         } else {
1636                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1637         }
1638 out:
1639         btrfs_free_path(path);
1640         return ret;
1641 }
1642
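/*
 * Insert a dev extent item for @device that maps the physical range
 * [@start, @start + @num_bytes) to the chunk at @chunk_offset in the
 * chunk tree.
 */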
1643 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1644                                   struct btrfs_device *device,
1645                                   u64 chunk_offset, u64 start, u64 num_bytes)
1646 {
1647         int ret;
1648         struct btrfs_path *path;
1649         struct btrfs_fs_info *fs_info = device->fs_info;
1650         struct btrfs_root *root = fs_info->dev_root;
1651         struct btrfs_dev_extent *extent;
1652         struct extent_buffer *leaf;
1653         struct btrfs_key key;
1654
1655         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1656         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1657         path = btrfs_alloc_path();
1658         if (!path)
1659                 return -ENOMEM;
1660
1661         key.objectid = device->devid;
1662         key.offset = start;
1663         key.type = BTRFS_DEV_EXTENT_KEY;
1664         ret = btrfs_insert_empty_item(trans, root, path, &key,
1665                                       sizeof(*extent));
1666         if (ret)
1667                 goto out;
1668
1669         leaf = path->nodes[0];
1670         extent = btrfs_item_ptr(leaf, path->slots[0],
1671                                 struct btrfs_dev_extent);
1672         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1673                                         BTRFS_CHUNK_TREE_OBJECTID);
1674         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1675                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1676         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1677
1678         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1679         btrfs_mark_buffer_dirty(leaf);
1680 out:
1681         btrfs_free_path(path);
1682         return ret;
1683 }
1684
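/*
 * Return the logical offset right after the highest existing chunk
 * mapping, i.e. the start offset to use for the next chunk, or 0 if no
 * chunks are mapped yet.
 */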
1685 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1686 {
1687         struct extent_map_tree *em_tree;
1688         struct extent_map *em;
1689         struct rb_node *n;
1690         u64 ret = 0;
1691
1692         em_tree = &fs_info->mapping_tree.map_tree;
1693         read_lock(&em_tree->lock);
1694         n = rb_last(&em_tree->map.rb_root);
1695         if (n) {
1696                 em = rb_entry(n, struct extent_map, rb_node);
1697                 ret = em->start + em->len;
1698         }
1699         read_unlock(&em_tree->lock);
1700
1701         return ret;
1702 }
1703
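/*
 * Find the next available device id: one past the highest devid found
 * in the chunk tree, or 1 if no dev items exist yet.
 */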
1704 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1705                                     u64 *devid_ret)
1706 {
1707         int ret;
1708         struct btrfs_key key;
1709         struct btrfs_key found_key;
1710         struct btrfs_path *path;
1711
1712         path = btrfs_alloc_path();
1713         if (!path)
1714                 return -ENOMEM;
1715
1716         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1717         key.type = BTRFS_DEV_ITEM_KEY;
1718         key.offset = (u64)-1;
1719
1720         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1721         if (ret < 0)
1722                 goto error;
1723
1724         BUG_ON(ret == 0); /* Corruption */
1725
1726         ret = btrfs_previous_item(fs_info->chunk_root, path,
1727                                   BTRFS_DEV_ITEMS_OBJECTID,
1728                                   BTRFS_DEV_ITEM_KEY);
1729         if (ret) {
1730                 *devid_ret = 1;
1731         } else {
1732                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1733                                       path->slots[0]);
1734                 *devid_ret = found_key.offset + 1;
1735         }
1736         ret = 0;
1737 error:
1738         btrfs_free_path(path);
1739         return ret;
1740 }
1741
1742 /*
1743  * The device information is stored in the chunk root.
1744  * The btrfs_device struct should be fully filled in.
1745  */
1746 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1747                             struct btrfs_device *device)
1748 {
1749         int ret;
1750         struct btrfs_path *path;
1751         struct btrfs_dev_item *dev_item;
1752         struct extent_buffer *leaf;
1753         struct btrfs_key key;
1754         unsigned long ptr;
1755
1756         path = btrfs_alloc_path();
1757         if (!path)
1758                 return -ENOMEM;
1759
1760         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1761         key.type = BTRFS_DEV_ITEM_KEY;
1762         key.offset = device->devid;
1763
1764         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1765                                       &key, sizeof(*dev_item));
1766         if (ret)
1767                 goto out;
1768
1769         leaf = path->nodes[0];
1770         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1771
1772         btrfs_set_device_id(leaf, dev_item, device->devid);
1773         btrfs_set_device_generation(leaf, dev_item, 0);
1774         btrfs_set_device_type(leaf, dev_item, device->type);
1775         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1776         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1777         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1778         btrfs_set_device_total_bytes(leaf, dev_item,
1779                                      btrfs_device_get_disk_total_bytes(device));
1780         btrfs_set_device_bytes_used(leaf, dev_item,
1781                                     btrfs_device_get_bytes_used(device));
1782         btrfs_set_device_group(leaf, dev_item, 0);
1783         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1784         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1785         btrfs_set_device_start_offset(leaf, dev_item, 0);
1786
1787         ptr = btrfs_device_uuid(dev_item);
1788         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1789         ptr = btrfs_device_fsid(dev_item);
1790         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1791                             ptr, BTRFS_FSID_SIZE);
1792         btrfs_mark_buffer_dirty(leaf);
1793
1794         ret = 0;
1795 out:
1796         btrfs_free_path(path);
1797         return ret;
1798 }
1799
1800 /*
1801  * Function to update ctime/mtime for a given device path.
1802  * Mainly used for ctime/mtime based probing by tools like libblkid.
1803  */
1804 static void update_dev_time(const char *path_name)
1805 {
1806         struct file *filp;
1807
1808         filp = filp_open(path_name, O_RDWR, 0);
1809         if (IS_ERR(filp))
1810                 return;
1811         file_update_time(filp);
1812         filp_close(filp, NULL);
1813 }
1814
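/*
 * Delete the dev item for @device from the chunk tree in its own
 * transaction and commit it.
 */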
1815 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1816                              struct btrfs_device *device)
1817 {
1818         struct btrfs_root *root = fs_info->chunk_root;
1819         int ret;
1820         struct btrfs_path *path;
1821         struct btrfs_key key;
1822         struct btrfs_trans_handle *trans;
1823
1824         path = btrfs_alloc_path();
1825         if (!path)
1826                 return -ENOMEM;
1827
1828         trans = btrfs_start_transaction(root, 0);
1829         if (IS_ERR(trans)) {
1830                 btrfs_free_path(path);
1831                 return PTR_ERR(trans);
1832         }
1833         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1834         key.type = BTRFS_DEV_ITEM_KEY;
1835         key.offset = device->devid;
1836
1837         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1838         if (ret) {
1839                 if (ret > 0)
1840                         ret = -ENOENT;
1841                 btrfs_abort_transaction(trans, ret);
1842                 btrfs_end_transaction(trans);
1843                 goto out;
1844         }
1845
1846         ret = btrfs_del_item(trans, root, path);
1847         if (ret) {
1848                 btrfs_abort_transaction(trans, ret);
1849                 btrfs_end_transaction(trans);
1850         }
1851
1852 out:
1853         btrfs_free_path(path);
1854         if (!ret)
1855                 ret = btrfs_commit_transaction(trans);
1856         return ret;
1857 }
1858
1859 /*
1860  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1861  * filesystem. It's up to the caller to adjust that number with regard to,
1862  * e.g., device replace.
1863  */
1864 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1865                 u64 num_devices)
1866 {
1867         u64 all_avail;
1868         unsigned seq;
1869         int i;
1870
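        /*
         * Sample the allocation profile bits of all block group types
         * under the profiles seqlock, retrying until the snapshot is
         * consistent.
         */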
1871         do {
1872                 seq = read_seqbegin(&fs_info->profiles_lock);
1873
1874                 all_avail = fs_info->avail_data_alloc_bits |
1875                             fs_info->avail_system_alloc_bits |
1876                             fs_info->avail_metadata_alloc_bits;
1877         } while (read_seqretry(&fs_info->profiles_lock, seq));
1878
1879         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1880                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1881                         continue;
1882
1883                 if (num_devices < btrfs_raid_array[i].devs_min) {
1884                         int ret = btrfs_raid_array[i].mindev_error;
1885
1886                         if (ret)
1887                                 return ret;
1888                 }
1889         }
1890
1891         return 0;
1892 }
1893
1894 static struct btrfs_device *btrfs_find_next_active_device(
1895                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1896 {
1897         struct btrfs_device *next_device;
1898
1899         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1900                 if (next_device != device &&
1901                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1902                     && next_device->bdev)
1903                         return next_device;
1904         }
1905
1906         return NULL;
1907 }
1908
1909 /*
1910  * Helper function to check if the given device is part of s_bdev / latest_bdev
1911  * and replace it with the provided or the next active device. In the context
1912  * where this function is called, there should always be another device (or
1913  * this_dev) which is active.
1914  */
1915 void btrfs_assign_next_active_device(struct btrfs_device *device,
1916                                      struct btrfs_device *this_dev)
1917 {
1918         struct btrfs_fs_info *fs_info = device->fs_info;
1919         struct btrfs_device *next_device;
1920
1921         if (this_dev)
1922                 next_device = this_dev;
1923         else
1924                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1925                                                                 device);
1926         ASSERT(next_device);
1927
1928         if (fs_info->sb->s_bdev &&
1929                         (fs_info->sb->s_bdev == device->bdev))
1930                 fs_info->sb->s_bdev = next_device->bdev;
1931
1932         if (fs_info->fs_devices->latest_bdev == device->bdev)
1933                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1934 }
1935
1936 /*
1937  * Return btrfs_fs_devices::num_devices excluding the device that's being
1938  * currently replaced.
1939  */
1940 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1941 {
1942         u64 num_devices = fs_info->fs_devices->num_devices;
1943
1944         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1945         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1946                 ASSERT(num_devices > 1);
1947                 num_devices--;
1948         }
1949         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1950
1951         return num_devices;
1952 }
1953
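/*
 * Remove a device from the filesystem: check that the remaining devices
 * still satisfy the RAID constraints, relocate all of the device's chunks
 * via btrfs_shrink_device(), delete its dev item, unlink it from the
 * device lists and sysfs, then wipe its superblocks and release the
 * block device.
 */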
1954 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1955                 u64 devid)
1956 {
1957         struct btrfs_device *device;
1958         struct btrfs_fs_devices *cur_devices;
1959         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1960         u64 num_devices;
1961         int ret = 0;
1962
1963         mutex_lock(&uuid_mutex);
1964
1965         num_devices = btrfs_num_devices(fs_info);
1966
1967         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1968         if (ret)
1969                 goto out;
1970
1971         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
1972
1973         if (IS_ERR(device)) {
1974                 if (PTR_ERR(device) == -ENOENT &&
1975                     strcmp(device_path, "missing") == 0)
1976                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1977                 else
1978                         ret = PTR_ERR(device);
1979                 goto out;
1980         }
1981
1982         if (btrfs_pinned_by_swapfile(fs_info, device)) {
1983                 btrfs_warn_in_rcu(fs_info,
1984                   "cannot remove device %s (devid %llu) due to active swapfile",
1985                                   rcu_str_deref(device->name), device->devid);
1986                 ret = -ETXTBSY;
1987                 goto out;
1988         }
1989
1990         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1991                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1992                 goto out;
1993         }
1994
1995         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1996             fs_info->fs_devices->rw_devices == 1) {
1997                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1998                 goto out;
1999         }
2000
2001         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2002                 mutex_lock(&fs_info->chunk_mutex);
2003                 list_del_init(&device->dev_alloc_list);
2004                 device->fs_devices->rw_devices--;
2005                 mutex_unlock(&fs_info->chunk_mutex);
2006         }
2007
2008         mutex_unlock(&uuid_mutex);
2009         ret = btrfs_shrink_device(device, 0);
2010         mutex_lock(&uuid_mutex);
2011         if (ret)
2012                 goto error_undo;
2013
2014         /*
2015          * TODO: the superblock still includes this device in its num_devices
2016          * counter although write_all_supers() is not locked out. This
2017          * could give a filesystem state which requires a degraded mount.
2018          */
2019         ret = btrfs_rm_dev_item(fs_info, device);
2020         if (ret)
2021                 goto error_undo;
2022
2023         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2024         btrfs_scrub_cancel_dev(fs_info, device);
2025
2026         /*
2027          * the device list mutex makes sure that we don't change
2028          * the device list while someone else is writing out all
2029          * the device supers. Whoever is writing all supers, should
2030          * lock the device list mutex before getting the number of
2031          * devices in the super block (super_copy). Conversely,
2032          * whoever updates the number of devices in the super block
2033          * (super_copy) should hold the device list mutex.
2034          */
2035
2036         /*
2037          * In normal cases cur_devices == fs_devices. But when
2038          * deleting a seed device, cur_devices should point to the
2039          * seed's own fs_devices, listed under fs_devices->seed.
2040          */
2041         cur_devices = device->fs_devices;
2042         mutex_lock(&fs_devices->device_list_mutex);
2043         list_del_rcu(&device->dev_list);
2044
2045         cur_devices->num_devices--;
2046         cur_devices->total_devices--;
2047         /* Update total_devices of the parent fs_devices if it's seed */
2048         if (cur_devices != fs_devices)
2049                 fs_devices->total_devices--;
2050
2051         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2052                 cur_devices->missing_devices--;
2053
2054         btrfs_assign_next_active_device(device, NULL);
2055
2056         if (device->bdev) {
2057                 cur_devices->open_devices--;
2058                 /* remove sysfs entry */
2059                 btrfs_sysfs_rm_device_link(fs_devices, device);
2060         }
2061
2062         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2063         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2064         mutex_unlock(&fs_devices->device_list_mutex);
2065
2066         /*
2067          * at this point, the device is zero sized and detached from
2068          * the devices list.  All that's left is to zero out the old
2069          * supers and free the device.
2070          */
2071         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2072                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2073
2074         btrfs_close_bdev(device);
2075         call_rcu(&device->rcu, free_device_rcu);
2076
2077         if (cur_devices->open_devices == 0) {
2078                 while (fs_devices) {
2079                         if (fs_devices->seed == cur_devices) {
2080                                 fs_devices->seed = cur_devices->seed;
2081                                 break;
2082                         }
2083                         fs_devices = fs_devices->seed;
2084                 }
2085                 cur_devices->seed = NULL;
2086                 close_fs_devices(cur_devices);
2087                 free_fs_devices(cur_devices);
2088         }
2089
2090 out:
2091         mutex_unlock(&uuid_mutex);
2092         return ret;
2093
2094 error_undo:
2095         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2096                 mutex_lock(&fs_info->chunk_mutex);
2097                 list_add(&device->dev_alloc_list,
2098                          &fs_devices->alloc_list);
2099                 device->fs_devices->rw_devices++;
2100                 mutex_unlock(&fs_info->chunk_mutex);
2101         }
2102         goto out;
2103 }
2104
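/*
 * Unlink the replace source device from its fs_devices lists and adjust
 * the device counters. The caller must hold the device_list_mutex
 * (asserted below).
 */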
2105 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2106 {
2107         struct btrfs_fs_devices *fs_devices;
2108
2109         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2110
2111         /*
2112          * In case of an fs with no seed, srcdev->fs_devices will point to
2113          * the fs_devices of fs_info. However, when the dev being replaced
2114          * is a seed dev it will point to the seed's local fs_devices. In
2115          * short, srcdev will have its correct fs_devices in both cases.
2116          */
2117         fs_devices = srcdev->fs_devices;
2118
2119         list_del_rcu(&srcdev->dev_list);
2120         list_del(&srcdev->dev_alloc_list);
2121         fs_devices->num_devices--;
2122         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2123                 fs_devices->missing_devices--;
2124
2125         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2126                 fs_devices->rw_devices--;
2127
2128         if (srcdev->bdev)
2129                 fs_devices->open_devices--;
2130 }
2131
2132 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2133                                       struct btrfs_device *srcdev)
2134 {
2135         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2136
2137         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2138                 /* zero out the old super if it is writable */
2139                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2140         }
2141
2142         btrfs_close_bdev(srcdev);
2143         call_rcu(&srcdev->rcu, free_device_rcu);
2144
2145         /* if there are no devs left we'd rather delete the fs_devices */
2146         if (!fs_devices->num_devices) {
2147                 struct btrfs_fs_devices *tmp_fs_devices;
2148
2149                 /*
2150                  * On a mounted FS, num_devices can't be zero unless it's a
2151                  * seed. In case of a seed device being replaced, the replace
2152                  * target is added to the sprout FS, so there will be no
2153                  * more devices left under the seed FS.
2154                  */
2155                 ASSERT(fs_devices->seeding);
2156
2157                 tmp_fs_devices = fs_info->fs_devices;
2158                 while (tmp_fs_devices) {
2159                         if (tmp_fs_devices->seed == fs_devices) {
2160                                 tmp_fs_devices->seed = fs_devices->seed;
2161                                 break;
2162                         }
2163                         tmp_fs_devices = tmp_fs_devices->seed;
2164                 }
2165                 fs_devices->seed = NULL;
2166                 close_fs_devices(fs_devices);
2167                 free_fs_devices(fs_devices);
2168         }
2169 }
2170
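/*
 * Tear down the target device of an unfinished replace: unlink it from
 * sysfs and the device list, hand off s_bdev/latest_bdev if needed, wipe
 * its superblocks and release it.
 */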
2171 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2172 {
2173         struct btrfs_fs_devices *fs_devices;
2174
2175         WARN_ON(!tgtdev);
2176         fs_devices = tgtdev->fs_info->fs_devices;
2177         mutex_lock(&fs_devices->device_list_mutex);
2178         btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2179
2180         if (tgtdev->bdev)
2181                 fs_devices->open_devices--;
2182
2183         fs_devices->num_devices--;
2184
2185         btrfs_assign_next_active_device(tgtdev, NULL);
2186
2187         list_del_rcu(&tgtdev->dev_list);
2188
2189         mutex_unlock(&fs_devices->device_list_mutex);
2190
2191         /*
2192          * The update_dev_time() within btrfs_scratch_superblocks()
2193          * may lead to a call to btrfs_show_devname() which will try
2194          * to hold device_list_mutex. And here this device is already
2195          * out of the device list, so we don't have to hold the
2196          * device_list_mutex lock.
2197          */
2198         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2199
2200         btrfs_close_bdev(tgtdev);
2201         call_rcu(&tgtdev->rcu, free_device_rcu);
2202 }
2203
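/*
 * Read the superblock from @device_path and look up the corresponding
 * btrfs_device by devid and uuid/fsid. Returns ERR_PTR(-ENOENT) if no
 * matching device is found.
 */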
2204 static struct btrfs_device *btrfs_find_device_by_path(
2205                 struct btrfs_fs_info *fs_info, const char *device_path)
2206 {
2207         int ret = 0;
2208         struct btrfs_super_block *disk_super;
2209         u64 devid;
2210         u8 *dev_uuid;
2211         struct block_device *bdev;
2212         struct buffer_head *bh;
2213         struct btrfs_device *device;
2214
2215         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2216                                     fs_info->bdev_holder, 0, &bdev, &bh);
2217         if (ret)
2218                 return ERR_PTR(ret);
2219         disk_super = (struct btrfs_super_block *)bh->b_data;
2220         devid = btrfs_stack_device_id(&disk_super->dev_item);
2221         dev_uuid = disk_super->dev_item.uuid;
2222         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2223                 device = btrfs_find_device(fs_info, devid, dev_uuid,
2224                                 disk_super->metadata_uuid);
2225         else
2226                 device = btrfs_find_device(fs_info, devid,
2227                                 dev_uuid, disk_super->fsid);
2228
2229         brelse(bh);
2230         if (!device)
2231                 device = ERR_PTR(-ENOENT);
2232         blkdev_put(bdev, FMODE_READ);
2233         return device;
2234 }
2235
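/*
 * Like btrfs_find_device_by_path(), except that the special path
 * "missing" selects the first device that is present in the metadata
 * but has no backing bdev.
 */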
2236 static struct btrfs_device *btrfs_find_device_missing_or_by_path(
2237                 struct btrfs_fs_info *fs_info, const char *device_path)
2238 {
2239         struct btrfs_device *device = NULL;
2240         if (strcmp(device_path, "missing") == 0) {
2241                 struct list_head *devices;
2242                 struct btrfs_device *tmp;
2243
2244                 devices = &fs_info->fs_devices->devices;
2245                 list_for_each_entry(tmp, devices, dev_list) {
2246                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2247                                         &tmp->dev_state) && !tmp->bdev) {
2248                                 device = tmp;
2249                                 break;
2250                         }
2251                 }
2252
2253                 if (!device)
2254                         return ERR_PTR(-ENOENT);
2255         } else {
2256                 device = btrfs_find_device_by_path(fs_info, device_path);
2257         }
2258
2259         return device;
2260 }
2261
2262 /*
2263  * Lookup a device given by device id, or the path if the id is 0.
2264  */
2265 struct btrfs_device *btrfs_find_device_by_devspec(
2266                 struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
2267 {
2268         struct btrfs_device *device;
2269
2270         if (devid) {
2271                 device = btrfs_find_device(fs_info, devid, NULL, NULL);
2272                 if (!device)
2273                         return ERR_PTR(-ENOENT);
2274         } else {
2275                 if (!devpath || !devpath[0])
2276                         return ERR_PTR(-EINVAL);
2277                 device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
2278         }
2279         return device;
2280 }
2281
2282 /*
2283  * Does all the dirty work required for changing the file system's UUID.
2284  */
2285 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2286 {
2287         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2288         struct btrfs_fs_devices *old_devices;
2289         struct btrfs_fs_devices *seed_devices;
2290         struct btrfs_super_block *disk_super = fs_info->super_copy;
2291         struct btrfs_device *device;
2292         u64 super_flags;
2293
2294         lockdep_assert_held(&uuid_mutex);
2295         if (!fs_devices->seeding)
2296                 return -EINVAL;
2297
2298         seed_devices = alloc_fs_devices(NULL, NULL);
2299         if (IS_ERR(seed_devices))
2300                 return PTR_ERR(seed_devices);
2301
2302         old_devices = clone_fs_devices(fs_devices);
2303         if (IS_ERR(old_devices)) {
2304                 kfree(seed_devices);
2305                 return PTR_ERR(old_devices);
2306         }
2307
2308         list_add(&old_devices->fs_list, &fs_uuids);
2309
2310         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2311         seed_devices->opened = 1;
2312         INIT_LIST_HEAD(&seed_devices->devices);
2313         INIT_LIST_HEAD(&seed_devices->alloc_list);
2314         mutex_init(&seed_devices->device_list_mutex);
2315
2316         mutex_lock(&fs_devices->device_list_mutex);
2317         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2318                               synchronize_rcu);
2319         list_for_each_entry(device, &seed_devices->devices, dev_list)
2320                 device->fs_devices = seed_devices;
2321
2322         mutex_lock(&fs_info->chunk_mutex);
2323         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2324         mutex_unlock(&fs_info->chunk_mutex);
2325
2326         fs_devices->seeding = 0;
2327         fs_devices->num_devices = 0;
2328         fs_devices->open_devices = 0;
2329         fs_devices->missing_devices = 0;
2330         fs_devices->rotating = 0;
2331         fs_devices->seed = seed_devices;
2332
2333         generate_random_uuid(fs_devices->fsid);
2334         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2335         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2336         mutex_unlock(&fs_devices->device_list_mutex);
2337
2338         super_flags = btrfs_super_flags(disk_super) &
2339                       ~BTRFS_SUPER_FLAG_SEEDING;
2340         btrfs_set_super_flags(disk_super, super_flags);
2341
2342         return 0;
2343 }
2344
2345 /*
2346  * Store the expected generation for seed devices in device items.
2347  */
2348 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2349                                struct btrfs_fs_info *fs_info)
2350 {
2351         struct btrfs_root *root = fs_info->chunk_root;
2352         struct btrfs_path *path;
2353         struct extent_buffer *leaf;
2354         struct btrfs_dev_item *dev_item;
2355         struct btrfs_device *device;
2356         struct btrfs_key key;
2357         u8 fs_uuid[BTRFS_FSID_SIZE];
2358         u8 dev_uuid[BTRFS_UUID_SIZE];
2359         u64 devid;
2360         int ret;
2361
2362         path = btrfs_alloc_path();
2363         if (!path)
2364                 return -ENOMEM;
2365
2366         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2367         key.offset = 0;
2368         key.type = BTRFS_DEV_ITEM_KEY;
2369
2370         while (1) {
2371                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2372                 if (ret < 0)
2373                         goto error;
2374
2375                 leaf = path->nodes[0];
2376 next_slot:
2377                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2378                         ret = btrfs_next_leaf(root, path);
2379                         if (ret > 0)
2380                                 break;
2381                         if (ret < 0)
2382                                 goto error;
2383                         leaf = path->nodes[0];
2384                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2385                         btrfs_release_path(path);
2386                         continue;
2387                 }
2388
2389                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2390                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2391                     key.type != BTRFS_DEV_ITEM_KEY)
2392                         break;
2393
2394                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2395                                           struct btrfs_dev_item);
2396                 devid = btrfs_device_id(leaf, dev_item);
2397                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2398                                    BTRFS_UUID_SIZE);
2399                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2400                                    BTRFS_FSID_SIZE);
2401                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2402                 BUG_ON(!device); /* Logic error */
2403
2404                 if (device->fs_devices->seeding) {
2405                         btrfs_set_device_generation(leaf, dev_item,
2406                                                     device->generation);
2407                         btrfs_mark_buffer_dirty(leaf);
2408                 }
2409
2410                 path->slots[0]++;
2411                 goto next_slot;
2412         }
2413         ret = 0;
2414 error:
2415         btrfs_free_path(path);
2416         return ret;
2417 }
2418
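/*
 * Add the block device at @device_path as a new device of the mounted
 * filesystem: open it exclusively, allocate and link a btrfs_device,
 * sprout a new fs from the seed if needed, update the superblock totals
 * and insert the dev item, then commit the transaction.
 */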
2419 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2420 {
2421         struct btrfs_root *root = fs_info->dev_root;
2422         struct request_queue *q;
2423         struct btrfs_trans_handle *trans;
2424         struct btrfs_device *device;
2425         struct block_device *bdev;
2426         struct super_block *sb = fs_info->sb;
2427         struct rcu_string *name;
2428         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2429         u64 orig_super_total_bytes;
2430         u64 orig_super_num_devices;
2431         int seeding_dev = 0;
2432         int ret = 0;
2433         bool unlocked = false;
2434
2435         if (sb_rdonly(sb) && !fs_devices->seeding)
2436                 return -EROFS;
2437
2438         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2439                                   fs_info->bdev_holder);
2440         if (IS_ERR(bdev))
2441                 return PTR_ERR(bdev);
2442
2443         if (fs_devices->seeding) {
2444                 seeding_dev = 1;
2445                 down_write(&sb->s_umount);
2446                 mutex_lock(&uuid_mutex);
2447         }
2448
2449         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2450
2451         mutex_lock(&fs_devices->device_list_mutex);
2452         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2453                 if (device->bdev == bdev) {
2454                         ret = -EEXIST;
2455                         mutex_unlock(
2456                                 &fs_devices->device_list_mutex);
2457                         goto error;
2458                 }
2459         }
2460         mutex_unlock(&fs_devices->device_list_mutex);
2461
2462         device = btrfs_alloc_device(fs_info, NULL, NULL);
2463         if (IS_ERR(device)) {
2464                 /* we can safely leave the fs_devices entry around */
2465                 ret = PTR_ERR(device);
2466                 goto error;
2467         }
2468
2469         name = rcu_string_strdup(device_path, GFP_KERNEL);
2470         if (!name) {
2471                 ret = -ENOMEM;
2472                 goto error_free_device;
2473         }
2474         rcu_assign_pointer(device->name, name);
2475
2476         trans = btrfs_start_transaction(root, 0);
2477         if (IS_ERR(trans)) {
2478                 ret = PTR_ERR(trans);
2479                 goto error_free_device;
2480         }
2481
2482         q = bdev_get_queue(bdev);
2483         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2484         device->generation = trans->transid;
2485         device->io_width = fs_info->sectorsize;
2486         device->io_align = fs_info->sectorsize;
2487         device->sector_size = fs_info->sectorsize;
2488         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2489                                          fs_info->sectorsize);
2490         device->disk_total_bytes = device->total_bytes;
2491         device->commit_total_bytes = device->total_bytes;
2492         device->fs_info = fs_info;
2493         device->bdev = bdev;
2494         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2495         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2496         device->mode = FMODE_EXCL;
2497         device->dev_stats_valid = 1;
2498         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2499
2500         if (seeding_dev) {
2501                 sb->s_flags &= ~SB_RDONLY;
2502                 ret = btrfs_prepare_sprout(fs_info);
2503                 if (ret) {
2504                         btrfs_abort_transaction(trans, ret);
2505                         goto error_trans;
2506                 }
2507         }
2508
2509         device->fs_devices = fs_devices;
2510
2511         mutex_lock(&fs_devices->device_list_mutex);
2512         mutex_lock(&fs_info->chunk_mutex);
2513         list_add_rcu(&device->dev_list, &fs_devices->devices);
2514         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2515         fs_devices->num_devices++;
2516         fs_devices->open_devices++;
2517         fs_devices->rw_devices++;
2518         fs_devices->total_devices++;
2519         fs_devices->total_rw_bytes += device->total_bytes;
2520
2521         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2522
2523         if (!blk_queue_nonrot(q))
2524                 fs_devices->rotating = 1;
2525
2526         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2527         btrfs_set_super_total_bytes(fs_info->super_copy,
2528                 round_down(orig_super_total_bytes + device->total_bytes,
2529                            fs_info->sectorsize));
2530
2531         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2532         btrfs_set_super_num_devices(fs_info->super_copy,
2533                                     orig_super_num_devices + 1);
2534
2535         /* add sysfs device entry */
2536         btrfs_sysfs_add_device_link(fs_devices, device);
2537
2538         /*
2539          * we've got more storage, clear any full flags on the space
2540          * infos
2541          */
2542         btrfs_clear_space_info_full(fs_info);
2543
2544         mutex_unlock(&fs_info->chunk_mutex);
2545         mutex_unlock(&fs_devices->device_list_mutex);
2546
2547         if (seeding_dev) {
2548                 mutex_lock(&fs_info->chunk_mutex);
2549                 ret = init_first_rw_device(trans, fs_info);
2550                 mutex_unlock(&fs_info->chunk_mutex);
2551                 if (ret) {
2552                         btrfs_abort_transaction(trans, ret);
2553                         goto error_sysfs;
2554                 }
2555         }
2556
2557         ret = btrfs_add_dev_item(trans, device);
2558         if (ret) {
2559                 btrfs_abort_transaction(trans, ret);
2560                 goto error_sysfs;
2561         }
2562
2563         if (seeding_dev) {
2564                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2565
2566                 ret = btrfs_finish_sprout(trans, fs_info);
2567                 if (ret) {
2568                         btrfs_abort_transaction(trans, ret);
2569                         goto error_sysfs;
2570                 }
2571
2572                 /* Sprouting would change the fsid of the mounted root,
2573                  * so rename the fsid in sysfs.
2574                  */
2575                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2576                                                 fs_info->fs_devices->fsid);
2577                 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2578                         btrfs_warn(fs_info,
2579                                    "sysfs: failed to create fsid for sprout");
2580         }
2581
2582         ret = btrfs_commit_transaction(trans);
2583
2584         if (seeding_dev) {
2585                 mutex_unlock(&uuid_mutex);
2586                 up_write(&sb->s_umount);
2587                 unlocked = true;
2588
2589                 if (ret) /* transaction commit */
2590                         return ret;
2591
2592                 ret = btrfs_relocate_sys_chunks(fs_info);
2593                 if (ret < 0)
2594                         btrfs_handle_fs_error(fs_info, ret,
2595                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2596                 trans = btrfs_attach_transaction(root);
2597                 if (IS_ERR(trans)) {
2598                         if (PTR_ERR(trans) == -ENOENT)
2599                                 return 0;
2600                         ret = PTR_ERR(trans);
2601                         trans = NULL;
2602                         goto error_sysfs;
2603                 }
2604                 ret = btrfs_commit_transaction(trans);
2605         }
2606
2607         /* Update ctime/mtime for libblkid */
2608         update_dev_time(device_path);
2609         return ret;
2610
2611 error_sysfs:
2612         btrfs_sysfs_rm_device_link(fs_devices, device);
2613         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2614         mutex_lock(&fs_info->chunk_mutex);
2615         list_del_rcu(&device->dev_list);
2616         list_del(&device->dev_alloc_list);
2617         fs_info->fs_devices->num_devices--;
2618         fs_info->fs_devices->open_devices--;
2619         fs_info->fs_devices->rw_devices--;
2620         fs_info->fs_devices->total_devices--;
2621         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2622         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2623         btrfs_set_super_total_bytes(fs_info->super_copy,
2624                                     orig_super_total_bytes);
2625         btrfs_set_super_num_devices(fs_info->super_copy,
2626                                     orig_super_num_devices);
2627         mutex_unlock(&fs_info->chunk_mutex);
2628         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2629 error_trans:
2630         if (seeding_dev)
2631                 sb->s_flags |= SB_RDONLY;
2632         if (trans)
2633                 btrfs_end_transaction(trans);
2634 error_free_device:
2635         btrfs_free_device(device);
2636 error:
2637         blkdev_put(bdev, FMODE_EXCL);
2638         if (seeding_dev && !unlocked) {
2639                 mutex_unlock(&uuid_mutex);
2640                 up_write(&sb->s_umount);
2641         }
2642         return ret;
2643 }
2644
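/*
 * Write the in-memory geometry and sizes of @device back to its dev item
 * in the chunk tree.
 */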
2645 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2646                                         struct btrfs_device *device)
2647 {
2648         int ret;
2649         struct btrfs_path *path;
2650         struct btrfs_root *root = device->fs_info->chunk_root;
2651         struct btrfs_dev_item *dev_item;
2652         struct extent_buffer *leaf;
2653         struct btrfs_key key;
2654
2655         path = btrfs_alloc_path();
2656         if (!path)
2657                 return -ENOMEM;
2658
2659         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2660         key.type = BTRFS_DEV_ITEM_KEY;
2661         key.offset = device->devid;
2662
2663         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2664         if (ret < 0)
2665                 goto out;
2666
2667         if (ret > 0) {
2668                 ret = -ENOENT;
2669                 goto out;
2670         }
2671
2672         leaf = path->nodes[0];
2673         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2674
2675         btrfs_set_device_id(leaf, dev_item, device->devid);
2676         btrfs_set_device_type(leaf, dev_item, device->type);
2677         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2678         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2679         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2680         btrfs_set_device_total_bytes(leaf, dev_item,
2681                                      btrfs_device_get_disk_total_bytes(device));
2682         btrfs_set_device_bytes_used(leaf, dev_item,
2683                                     btrfs_device_get_bytes_used(device));
2684         btrfs_mark_buffer_dirty(leaf);
2685
2686 out:
2687         btrfs_free_path(path);
2688         return ret;
2689 }
2690
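/*
 * Grow @device to @new_size (rounded down to the sector size), updating
 * the superblock total and the writable byte count, and queue the device
 * on the resized list so the new size is persisted with the transaction.
 */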
2691 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2692                       struct btrfs_device *device, u64 new_size)
2693 {
2694         struct btrfs_fs_info *fs_info = device->fs_info;
2695         struct btrfs_super_block *super_copy = fs_info->super_copy;
2696         struct btrfs_fs_devices *fs_devices;
2697         u64 old_total;
2698         u64 diff;
2699
2700         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2701                 return -EACCES;
2702
2703         new_size = round_down(new_size, fs_info->sectorsize);
2704
2705         mutex_lock(&fs_info->chunk_mutex);
2706         old_total = btrfs_super_total_bytes(super_copy);
2707         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2708
2709         if (new_size <= device->total_bytes ||
2710             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2711                 mutex_unlock(&fs_info->chunk_mutex);
2712                 return -EINVAL;
2713         }
2714
2715         fs_devices = fs_info->fs_devices;
2716
2717         btrfs_set_super_total_bytes(super_copy,
2718                         round_down(old_total + diff, fs_info->sectorsize));
2719         device->fs_devices->total_rw_bytes += diff;
2720
2721         btrfs_device_set_total_bytes(device, new_size);
2722         btrfs_device_set_disk_total_bytes(device, new_size);
2723         btrfs_clear_space_info_full(device->fs_info);
2724         if (list_empty(&device->resized_list))
2725                 list_add_tail(&device->resized_list,
2726                               &fs_devices->resized_devices);
2727         mutex_unlock(&fs_info->chunk_mutex);
2728
2729         return btrfs_update_device(trans, device);
2730 }
2731
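/*
 * Delete the chunk item at @chunk_offset from the chunk tree.
 */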
2732 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2733 {
2734         struct btrfs_fs_info *fs_info = trans->fs_info;
2735         struct btrfs_root *root = fs_info->chunk_root;
2736         int ret;
2737         struct btrfs_path *path;
2738         struct btrfs_key key;
2739
2740         path = btrfs_alloc_path();
2741         if (!path)
2742                 return -ENOMEM;
2743
2744         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2745         key.offset = chunk_offset;
2746         key.type = BTRFS_CHUNK_ITEM_KEY;
2747
2748         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2749         if (ret < 0)
2750                 goto out;
2751         else if (ret > 0) { /* Logic error or corruption */
2752                 btrfs_handle_fs_error(fs_info, -ENOENT,
2753                                       "Failed lookup while freeing chunk.");
2754                 ret = -ENOENT;
2755                 goto out;
2756         }
2757
2758         ret = btrfs_del_item(trans, root, path);
2759         if (ret < 0)
2760                 btrfs_handle_fs_error(fs_info, ret,
2761                                       "Failed to delete chunk item.");
2762 out:
2763         btrfs_free_path(path);
2764         return ret;
2765 }
2766
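/*
 * Remove the entry for the chunk at @chunk_offset from the superblock's
 * in-memory sys_chunk_array and shrink the recorded array size.
 */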
2767 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2768 {
2769         struct btrfs_super_block *super_copy = fs_info->super_copy;
2770         struct btrfs_disk_key *disk_key;
2771         struct btrfs_chunk *chunk;
2772         u8 *ptr;
2773         int ret = 0;
2774         u32 num_stripes;
2775         u32 array_size;
2776         u32 len = 0;
2777         u32 cur;
2778         struct btrfs_key key;
2779
2780         mutex_lock(&fs_info->chunk_mutex);
2781         array_size = btrfs_super_sys_array_size(super_copy);
2782
2783         ptr = super_copy->sys_chunk_array;
2784         cur = 0;
2785
2786         while (cur < array_size) {
2787                 disk_key = (struct btrfs_disk_key *)ptr;
2788                 btrfs_disk_key_to_cpu(&key, disk_key);
2789
2790                 len = sizeof(*disk_key);
2791
2792                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2793                         chunk = (struct btrfs_chunk *)(ptr + len);
2794                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2795                         len += btrfs_chunk_item_size(num_stripes);
2796                 } else {
2797                         ret = -EIO;
2798                         break;
2799                 }
2800                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2801                     key.offset == chunk_offset) {
2802                         memmove(ptr, ptr + len, array_size - (cur + len));
2803                         array_size -= len;
2804                         btrfs_set_super_sys_array_size(super_copy, array_size);
2805                 } else {
2806                         ptr += len;
2807                         cur += len;
2808                 }
2809         }
2810         mutex_unlock(&fs_info->chunk_mutex);
2811         return ret;
2812 }
2813
2814 /*
2815  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2816  * @fs_info: The filesystem.
2817  * @logical: Logical block offset in bytes.
2818  * @length:  Length of extent in bytes.
2819  * Return: Chunk mapping or ERR_PTR.
2820  */
2821 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2822                                        u64 logical, u64 length)
2823 {
2824         struct extent_map_tree *em_tree;
2825         struct extent_map *em;
2826
2827         em_tree = &fs_info->mapping_tree.map_tree;
2828         read_lock(&em_tree->lock);
2829         em = lookup_extent_mapping(em_tree, logical, length);
2830         read_unlock(&em_tree->lock);
2831
2832         if (!em) {
2833                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2834                            logical, length);
2835                 return ERR_PTR(-EINVAL);
2836         }
2837
2838         if (em->start > logical || em->start + em->len < logical) {
2839                 btrfs_crit(fs_info,
2840                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2841                            logical, logical + length, em->start, em->start + em->len);
2842                 free_extent_map(em);
2843                 return ERR_PTR(-EINVAL);
2844         }
2845
2846         /* callers are responsible for dropping em's ref. */
2847         return em;
2848 }
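
/*
 * Illustrative caller pattern (a sketch, not from the original source;
 * see btrfs_remove_chunk() below for a real user):
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);
 *
 * A length of 1 is the usual idiom when only the chunk's start offset is
 * known and the caller wants whichever mapping covers that byte.
 */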
2849
2850 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2851 {
2852         struct btrfs_fs_info *fs_info = trans->fs_info;
2853         struct extent_map *em;
2854         struct map_lookup *map;
2855         u64 dev_extent_len = 0;
2856         int i, ret = 0;
2857         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2858
2859         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2860         if (IS_ERR(em)) {
2861                 /*
2862                  * This is a logic error, but we don't want to just rely on the
2863                  * user having built with ASSERT enabled, so if ASSERT doesn't
2864                  * do anything we still error out.
2865                  */
2866                 ASSERT(0);
2867                 return PTR_ERR(em);
2868         }
2869         map = em->map_lookup;
2870         mutex_lock(&fs_info->chunk_mutex);
2871         check_system_chunk(trans, map->type);
2872         mutex_unlock(&fs_info->chunk_mutex);
2873
2874         /*
2875          * Take the device list mutex to prevent races with the final phase of
2876          * a device replace operation that replaces the device object associated
2877          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2878          */
2879         mutex_lock(&fs_devices->device_list_mutex);
2880         for (i = 0; i < map->num_stripes; i++) {
2881                 struct btrfs_device *device = map->stripes[i].dev;
2882                 ret = btrfs_free_dev_extent(trans, device,
2883                                             map->stripes[i].physical,
2884                                             &dev_extent_len);
2885                 if (ret) {
2886                         mutex_unlock(&fs_devices->device_list_mutex);
2887                         btrfs_abort_transaction(trans, ret);
2888                         goto out;
2889                 }
2890
2891                 if (device->bytes_used > 0) {
2892                         mutex_lock(&fs_info->chunk_mutex);
2893                         btrfs_device_set_bytes_used(device,
2894                                         device->bytes_used - dev_extent_len);
2895                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2896                         btrfs_clear_space_info_full(fs_info);
2897                         mutex_unlock(&fs_info->chunk_mutex);
2898                 }
2899
2900                 ret = btrfs_update_device(trans, device);
2901                 if (ret) {
2902                         mutex_unlock(&fs_devices->device_list_mutex);
2903                         btrfs_abort_transaction(trans, ret);
2904                         goto out;
2905                 }
2906         }
2907         mutex_unlock(&fs_devices->device_list_mutex);
2908
2909         ret = btrfs_free_chunk(trans, chunk_offset);
2910         if (ret) {
2911                 btrfs_abort_transaction(trans, ret);
2912                 goto out;
2913         }
2914
2915         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2916
2917         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2918                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2919                 if (ret) {
2920                         btrfs_abort_transaction(trans, ret);
2921                         goto out;
2922                 }
2923         }
2924
2925         ret = btrfs_remove_block_group(trans, chunk_offset, em);
2926         if (ret) {
2927                 btrfs_abort_transaction(trans, ret);
2928                 goto out;
2929         }
2930
2931 out:
2932         /* once for us, dropping the ref taken by btrfs_get_chunk_map() */
2933         free_extent_map(em);
2934         return ret;
2935 }
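
/*
 * Added summary (commentary, not part of the original source): removal
 * proceeds in a fixed order - free each stripe's dev extent and update
 * the device items, delete the chunk tree item, drop the sys_chunk_array
 * entry for SYSTEM chunks, then remove the block group. Any failure
 * aborts the transaction, since a half-deleted chunk would leave the
 * chunk, device and extent trees inconsistent with each other.
 */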
2936
2937 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2938 {
2939         struct btrfs_root *root = fs_info->chunk_root;
2940         struct btrfs_trans_handle *trans;
2941         int ret;
2942
2943         /*
2944          * Prevent races with automatic removal of unused block groups.
2945          * After we relocate and before we remove the chunk with offset
2946          * chunk_offset, automatic removal of the block group can kick in,
2947          * resulting in a failure when calling btrfs_remove_chunk() below.
2948          *
2949          * Make sure to acquire this mutex before doing a tree search (dev
2950          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2951          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2952          * we release the path used to search the chunk/dev tree and before
2953          * the current task acquires this mutex and calls us.
2954          */
2955         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2956
2957         ret = btrfs_can_relocate(fs_info, chunk_offset);
2958         if (ret)
2959                 return -ENOSPC;
2960
2961         /* step one, relocate all the extents inside this chunk */
2962         btrfs_scrub_pause(fs_info);
2963         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2964         btrfs_scrub_continue(fs_info);
2965         if (ret)
2966                 return ret;
2967
2968         /*
2969          * We add the kobjects here (and after forcing data chunk creation)
2970          * since relocation is the only place we'll create chunks of a new
2971          * type at runtime.  The only place where we'll remove the last
2972          * chunk of a type is the call immediately below this one.  Even
2973          * so, we're protected against races with the cleaner thread since
2974          * we're covered by the delete_unused_bgs_mutex.
2975          */
2976         btrfs_add_raid_kobjects(fs_info);
2977
2978         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2979                                                      chunk_offset);
2980         if (IS_ERR(trans)) {
2981                 ret = PTR_ERR(trans);
2982                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2983                 return ret;
2984         }
2985
2986         /*
2987          * step two, delete the device extents and the
2988          * chunk tree entries
2989          */
2990         ret = btrfs_remove_chunk(trans, chunk_offset);
2991         btrfs_end_transaction(trans);
2992         return ret;
2993 }
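
/*
 * Caller-side sketch (illustrative; it mirrors btrfs_relocate_sys_chunks()
 * below rather than defining a new API). The mutex has to span both the
 * tree search that finds the chunk and the relocation itself:
 *
 *	mutex_lock(&fs_info->delete_unused_bgs_mutex);
 *	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 *	...
 *	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
 *	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 */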
2994
2995 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2996 {
2997         struct btrfs_root *chunk_root = fs_info->chunk_root;
2998         struct btrfs_path *path;
2999         struct extent_buffer *leaf;
3000         struct btrfs_chunk *chunk;
3001         struct btrfs_key key;
3002         struct btrfs_key found_key;
3003         u64 chunk_type;
3004         bool retried = false;
3005         int failed = 0;
3006         int ret;
3007
3008         path = btrfs_alloc_path();
3009         if (!path)
3010                 return -ENOMEM;
3011
3012 again:
3013         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3014         key.offset = (u64)-1;
3015         key.type = BTRFS_CHUNK_ITEM_KEY;
3016
3017         while (1) {
3018                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3019                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3020                 if (ret < 0) {
3021                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3022                         goto error;
3023                 }
3024                 BUG_ON(ret == 0); /* Corruption */
3025
3026                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3027                                           key.type);
3028                 if (ret)
3029                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3030                 if (ret < 0)
3031                         goto error;
3032                 if (ret > 0)
3033                         break;
3034
3035                 leaf = path->nodes[0];
3036                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3037
3038                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3039                                        struct btrfs_chunk);
3040                 chunk_type = btrfs_chunk_type(leaf, chunk);
3041                 btrfs_release_path(path);
3042
3043                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3044                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3045                         if (ret == -ENOSPC)
3046                                 failed++;
3047                         else
3048                                 BUG_ON(ret);
3049                 }
3050                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3051
3052                 if (found_key.offset == 0)
3053                         break;
3054                 key.offset = found_key.offset - 1;
3055         }
3056         ret = 0;
3057         if (failed && !retried) {
3058                 failed = 0;
3059                 retried = true;
3060                 goto again;
3061         } else if (WARN_ON(failed && retried)) {
3062                 ret = -ENOSPC;
3063         }
3064 error:
3065         btrfs_free_path(path);
3066         return ret;
3067 }
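
/*
 * Added commentary (not part of the original source): the scan above
 * walks the chunk tree from the highest offset downwards - key.offset
 * starts at (u64)-1 and btrfs_previous_item() steps backwards - so
 * SYSTEM chunks allocated while relocating (which get higher logical
 * offsets) are not re-visited by the same pass. Chunks that failed with
 * -ENOSPC are retried exactly once, on the theory that relocating the
 * rest may have freed the space they needed.
 */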
3068
3069 /*
3070  * Return 1 when a data chunk was allocated successfully,
3071  * return <0 when an error occurred while allocating a data chunk,
3072  * return 0 when there was no need to allocate a data chunk.
3073  */
3074 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3075                                       u64 chunk_offset)
3076 {
3077         struct btrfs_block_group_cache *cache;
3078         u64 bytes_used;
3079         u64 chunk_type;
3080
3081         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3082         ASSERT(cache);
3083         chunk_type = cache->flags;
3084         btrfs_put_block_group(cache);
3085
3086         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3087                 spin_lock(&fs_info->data_sinfo->lock);
3088                 bytes_used = fs_info->data_sinfo->bytes_used;
3089                 spin_unlock(&fs_info->data_sinfo->lock);
3090
3091                 if (!bytes_used) {
3092                         struct btrfs_trans_handle *trans;
3093                         int ret;
3094
3095                         trans = btrfs_join_transaction(fs_info->tree_root);
3096                         if (IS_ERR(trans))
3097                                 return PTR_ERR(trans);
3098
3099                         ret = btrfs_force_chunk_alloc(trans,
3100                                                       BTRFS_BLOCK_GROUP_DATA);
3101                         btrfs_end_transaction(trans);
3102                         if (ret < 0)
3103                                 return ret;
3104
3105                         btrfs_add_raid_kobjects(fs_info);
3106
3107                         return 1;
3108                 }
3109         }
3110         return 0;
3111 }
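
/*
 * Illustrative caller pattern (a sketch, not from the original source):
 * callers about to relocate what may be the last data chunk first make
 * sure an empty replacement exists:
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 *	if (ret < 0)
 *		return ret;
 *	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
 */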
3112
3113 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3114                                struct btrfs_balance_control *bctl)
3115 {
3116         struct btrfs_root *root = fs_info->tree_root;
3117         struct btrfs_trans_handle *trans;
3118         struct btrfs_balance_item *item;
3119         struct btrfs_disk_balance_args disk_bargs;
3120         struct btrfs_path *path;
3121         struct extent_buffer *leaf;
3122         struct btrfs_key key;
3123         int ret, err;
3124
3125         path = btrfs_alloc_path();
3126         if (!path)
3127                 return -ENOMEM;
3128
3129         trans = btrfs_start_transaction(root, 0);
3130         if (IS_ERR(trans)) {
3131                 btrfs_free_path(path);
3132                 return PTR_ERR(trans);
3133         }
3134
3135         key.objectid = BTRFS_BALANCE_OBJECTID;
3136         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3137         key.offset = 0;
3138
3139         ret = btrfs_insert_empty_item(trans, root, path, &key,
3140                                       sizeof(*item));
3141         if (ret)
3142                 goto out;
3143
3144         leaf = path->nodes[0];
3145         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3146
3147         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3148
3149         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3150         btrfs_set_balance_data(leaf, item, &disk_bargs);
3151         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3152         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3153         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3154         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3155
3156         btrfs_set_balance_flags(leaf, item, bctl->flags);
3157
3158         btrfs_mark_buffer_dirty(leaf);
3159 out:
3160         btrfs_free_path(path);
3161         err = btrfs_commit_transaction(trans);
3162         if (err && !ret)
3163                 ret = err;
3164         return ret;
3165 }
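
/*
 * Added commentary (not part of the original source): the balance item
 * is persisted at the fixed key (BTRFS_BALANCE_OBJECTID,
 * BTRFS_TEMPORARY_ITEM_KEY, 0) in the tree root and records the three
 * per-profile argument sets plus the flags, so an interrupted balance
 * can be detected and resumed after a remount. del_balance_item() below
 * removes it once balance completes or is canceled.
 */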
3166
3167 static int del_balance_item(struct btrfs_fs_info *fs_info)
3168 {
3169         struct btrfs_root *root = fs_info->tree_root;
3170         struct btrfs_trans_handle *trans;
3171         struct btrfs_path *path;
3172         struct btrfs_key key;
3173         int ret, err;
3174
3175         path = btrfs_alloc_path();
3176         if (!path)
3177                 return -ENOMEM;
3178
3179         trans = btrfs_start_transaction(root, 0);
3180         if (IS_ERR(trans)) {
3181                 btrfs_free_path(path);
3182                 return PTR_ERR(trans);
3183         }
3184
3185         key.objectid = BTRFS_BALANCE_OBJECTID;
3186         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3187         key.offset = 0;
3188
3189         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3190         if (ret < 0)
3191                 goto out;
3192         if (ret > 0) {
3193                 ret = -ENOENT;
3194                 goto out;
3195         }
3196
3197         ret = btrfs_del_item(trans, root, path);
3198 out:
3199         btrfs_free_path(path);
3200         err = btrfs_commit_transaction(trans);
3201         if (err && !ret)
3202                 ret = err;
3203         return ret;
3204 }
3205
3206 /*
3207  * This is a heuristic used to reduce the number of chunks balanced on
3208  * resume after balance was interrupted.
3209  */
3210 static void update_balance_args(struct btrfs_balance_control *bctl)
3211 {
3212         /*
3213          * Turn on soft mode for chunk types that were being converted.
3214          */
3215         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3216                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3217         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3218                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3219         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3220                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3221
3222         /*
3223  * Turn on the usage filter if it is not already in use.  The idea is
3224          * that chunks that we have already balanced should be
3225          * reasonably full.  Don't do it for chunks that are being
3226          * converted - that will keep us from relocating unconverted
3227          * (albeit full) chunks.
3228          */
3229         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3230             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3231             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3232                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3233                 bctl->data.usage = 90;
3234         }
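        /*
         * Illustrative effect (added commentary, not part of the original
         * source): a balance begun as "-dconvert=raid1" resumes with SOFT
         * added on top of CONVERT, while a plain "-d" balance resumes as
         * "-dusage=90", skipping chunks the interrupted run already left
         * reasonably full.
         */
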
3235         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3236             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&