btrfs: close devices without offloading to a temporary list
fs/btrfs/volumes.c (sfrench/cifs-2.6.git)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};

const char *get_raid_name(enum btrfs_raid_types type)
{
        if (type >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[type].raid_name;
}
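
/*
 * A minimal sketch (illustration only, not called anywhere in this file)
 * of how the table above is typically consulted: resolve a single
 * BTRFS_BLOCK_GROUP_* profile bit to its entry and read one attribute,
 * here ncopies.
 */
static int __maybe_unused demo_bg_flag_ncopies(u64 bg_flag)
{
        int i;

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                if (btrfs_raid_array[i].bg_flag == bg_flag)
                        return btrfs_raid_array[i].ncopies;
        }
        /*
         * A bg_flag of 0 already matched BTRFS_RAID_SINGLE above; unknown
         * bits fall back to a single copy.
         */
        return 1;
}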

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * While an operation is in the Paused state, BTRFS_FS_EXCL_OP remains set.
 * A device operation in the Paused or Running state can be canceled or
 * resumed either by ioctl (Balance only) or when the filesystem is
 * remounted read-write.  BTRFS_FS_EXCL_OP is cleared when the device
 * operation is canceled or completed.
 */
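
/*
 * A minimal sketch of the BTRFS_FS_EXCL_OP protocol described above
 * (illustration only, not called anywhere in this file).  The
 * -EINPROGRESS error code is an assumption for the example; the real
 * ioctl paths return their own error codes on contention.
 */
static int __maybe_unused demo_run_exclusive_op(struct btrfs_fs_info *fs_info)
{
        /* Atomically claim exclusivity; fail if another operation runs */
        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
                return -EINPROGRESS;

        /* ... the device add/remove/resize/balance work would go here ... */

        /* Release exclusivity once the operation completes or is canceled */
        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        return 0;
}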

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}
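
/*
 * A minimal sketch of the lock nesting documented above (illustration
 * only, not called anywhere in this file): uuid_mutex is taken before
 * any fs_devices::device_list_mutex.
 */
static void __maybe_unused demo_lock_nesting(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        mutex_lock(&uuid_mutex);
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* read-only inspection of each device is safe here */
        }
        mutex_unlock(&fs_devices->device_list_mutex);
        mutex_unlock(&uuid_mutex);
}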

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

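/*
 * Free the fs_devices structure and every btrfs_device still linked on
 * it.  Must only be called once the devices have been closed, hence the
 * WARN_ON(fs_devices->opened) below.
 */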
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

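/* Find fs_devices matching @fsid on the global fs_uuids list, or NULL. */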
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

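/*
 * Open the block device at @device_path, optionally flush it, set the
 * blocksize used for btrfs super blocks and read the super block into
 * @bh.  On failure both @bdev and @bh are reset to NULL.
 */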
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

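/*
 * Put a partially processed bio chain (from @head to @tail) back at the
 * front of the pending list so it is retried first on the next pass.
 */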
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for a particular
         * device.  We don't want to wander off to another device without
         * first sending all of these down.  So, set up a plug here and
         * finish it off before we return.
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 * Search for and remove all stale devices (devices which are not mounted).
 * When both arguments are NULL, it will search for and release all stale
 * devices.
 * path:       Optional. When provided, only unmounted devices matching
 *             this path are released.
 * skip_dev:   Optional. This device is skipped when searching for stale
 *             devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_dev)
{
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;

        list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
                if (fs_devs->opened)
                        continue;

                list_for_each_entry_safe(dev, tmp_dev,
                                         &fs_devs->devices, dev_list) {
                        int not_found = 0;

                        if (skip_dev && skip_dev == dev)
                                continue;
                        if (path && !dev->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(dev->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                btrfs_free_device(dev);
                        }
                }
        }
}

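/*
 * Open one device for IO: read its super block, verify that devid and
 * uuid match what we scanned earlier, and account it in @fs_devices
 * (open_devices, rw_devices, seeding, rotating).
 */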
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add a new device to the list of registered devices.
 *
 * Returns:
 * the device pointer that was just added or updated on success
 * an error pointer on failure
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return ERR_PTR(-EBUSY);

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                btrfs_free_stale_devices(path, device);

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and device->name is NULL, that means
                 *    this device was missing at the time of the FS mount.
                 * 2. If you are here and device->name differs from 'path',
                 *    that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has reappeared
                 *         now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in cases 1 and 2a above, the disk at 'path' would
                 * have missed some transactions while it was away, and in
                 * case 2a the stale bdev has to be updated as well. 2b must
                 * not be allowed at any time.
                 */

                /*
                 * For now, we do allow updates to btrfs_fs_device through
                 * the btrfs dev scan cli after the FS has been mounted.
                 * We're still tracking a problem where systems fail to mount
                 * by subvolume id when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and you are
                         * here, there is more than one disk with the same
                         * uuid and devid. We keep the one with the larger
                         * generation number, or the last-in if the
                         * generations are equal.
                         */
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        return device;
}

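/*
 * Create a duplicate fs_devices with copies of every device on @orig;
 * the clone shares no memory with the original list.
 */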
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold orig->device_list_mutex, so it is safe to walk the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know the devids belonging to
 * this filesystem, remove any device which does not belong to it.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialization path, so it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

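/*
 * Close one device and replace its list entry with a freshly allocated
 * shadow copy, so that RCU readers traversing fs_devices::devices never
 * see a btrfs_device with a stale bdev.  The old device is freed after
 * a grace period via call_rcu().
 */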
static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;

        call_rcu(&device->rcu, free_device_rcu);
}

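/*
 * Close all devices of @fs_devices in place, iterating the list under
 * device_list_mutex rather than splicing it onto a temporary list
 * first.  btrfs_close_one_device() keeps the list consistent for RCU
 * readers while we walk it.
 */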
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
                btrfs_close_one_device(device);
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

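/* list_sort() comparator: order devices by ascending devid. */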
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        mutex_lock(&fs_devices->device_list_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&fs_devices->device_list_mutex);
        mutex_unlock(&uuid_mutex);

        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

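/*
 * Read the super block at @bytenr from @bdev through the page cache and
 * do basic sanity checks (fits on the device, does not straddle a page,
 * bytenr and magic match).  Returns 0 and the mapped page on success,
 * 1 on any failure.
 */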
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called outside of the
 * mount path and we are not allowed to call set_blocksize during the scan.
 * The superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
        int ret = 0;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }

        mutex_lock(&uuid_mutex);
        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;
        mutex_unlock(&uuid_mutex);

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);

        return ret;
}

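/*
 * Check whether any pending or pinned chunk overlaps the range starting
 * at *start on @device.  If so, bump *start past the conflicting stripe
 * and return 1 so the caller can retry the search from there.
 */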
static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device in which we search for the free space
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space
 * @len:          the size of the free space that we find, or the size of the
 *                max free space if we don't find suitable free space
 *
 * This uses a pretty simple search; the expectation is that it is called
 * very infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space that we find. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * so far, so max_hole_start must point to the start
                         * of this free space and the length of this free
                         * space is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to
                         * the caller.
                         */
1434                         if (hole_size >= num_bytes) {
1435                                 ret = 0;
1436                                 goto out;
1437                         }
1438                 }
1439
1440                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1441                 extent_end = key.offset + btrfs_dev_extent_length(l,
1442                                                                   dev_extent);
1443                 if (extent_end > search_start)
1444                         search_start = extent_end;
1445 next:
1446                 path->slots[0]++;
1447                 cond_resched();
1448         }
1449
1450         /*
1451          * At this point, search_start should be the end of the
1452          * allocated dev extents, and when shrinking the device,
1453          * search_end may be smaller than search_start.
1454          */
1455         if (search_end > search_start) {
1456                 hole_size = search_end - search_start;
1457
1458                 if (contains_pending_extent(transaction, device, &search_start,
1459                                             hole_size)) {
1460                         btrfs_release_path(path);
1461                         goto again;
1462                 }
1463
1464                 if (hole_size > max_hole_size) {
1465                         max_hole_start = search_start;
1466                         max_hole_size = hole_size;
1467                 }
1468         }
1469
1470         /* See above. */
1471         if (max_hole_size < num_bytes)
1472                 ret = -ENOSPC;
1473         else
1474                 ret = 0;
1475
1476 out:
1477         btrfs_free_path(path);
1478         *start = max_hole_start;
1479         if (len)
1480                 *len = max_hole_size;
1481         return ret;
1482 }
1483
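/*
 * Convenience wrapper around find_free_dev_extent_start() that begins the
 * search at offset 0 of the device. On success *start holds the start of
 * the hole and, if @len is non-NULL, *len holds its size.
 *
 * For illustration, a chunk allocator might use this pair of helpers
 * roughly as below (a sketch only; variable names are illustrative):
 *
 *	ret = find_free_dev_extent(trans, device, stripe_size, &dev_offset,
 *				   NULL);
 *	if (!ret)
 *		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
 *					     dev_offset, stripe_size);
 */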
1484 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1485                          struct btrfs_device *device, u64 num_bytes,
1486                          u64 *start, u64 *len)
1487 {
1488         /* FIXME use last free of some kind */
1489         return find_free_dev_extent_start(trans->transaction, device,
1490                                           num_bytes, 0, start, len);
1491 }
1492
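/*
 * Remove the dev extent item for @device that covers @start. If the exact
 * key is not found, step back to the previous dev extent, verify that it
 * actually covers @start, and retry the deletion with the found key. On
 * success, *dev_extent_len is set to the length of the removed extent.
 */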
1493 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1494                           struct btrfs_device *device,
1495                           u64 start, u64 *dev_extent_len)
1496 {
1497         struct btrfs_fs_info *fs_info = device->fs_info;
1498         struct btrfs_root *root = fs_info->dev_root;
1499         int ret;
1500         struct btrfs_path *path;
1501         struct btrfs_key key;
1502         struct btrfs_key found_key;
1503         struct extent_buffer *leaf = NULL;
1504         struct btrfs_dev_extent *extent = NULL;
1505
1506         path = btrfs_alloc_path();
1507         if (!path)
1508                 return -ENOMEM;
1509
1510         key.objectid = device->devid;
1511         key.offset = start;
1512         key.type = BTRFS_DEV_EXTENT_KEY;
1513 again:
1514         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1515         if (ret > 0) {
1516                 ret = btrfs_previous_item(root, path, key.objectid,
1517                                           BTRFS_DEV_EXTENT_KEY);
1518                 if (ret)
1519                         goto out;
1520                 leaf = path->nodes[0];
1521                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1522                 extent = btrfs_item_ptr(leaf, path->slots[0],
1523                                         struct btrfs_dev_extent);
1524                 BUG_ON(found_key.offset > start || found_key.offset +
1525                        btrfs_dev_extent_length(leaf, extent) < start);
1526                 key = found_key;
1527                 btrfs_release_path(path);
1528                 goto again;
1529         } else if (ret == 0) {
1530                 leaf = path->nodes[0];
1531                 extent = btrfs_item_ptr(leaf, path->slots[0],
1532                                         struct btrfs_dev_extent);
1533         } else {
1534                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1535                 goto out;
1536         }
1537
1538         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1539
1540         ret = btrfs_del_item(trans, root, path);
1541         if (ret) {
1542                 btrfs_handle_fs_error(fs_info, ret,
1543                                       "Failed to remove dev extent item");
1544         } else {
1545                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1546         }
1547 out:
1548         btrfs_free_path(path);
1549         return ret;
1550 }
1551
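/*
 * Insert a new dev extent item for @device, mapping @num_bytes starting at
 * physical offset @start back to the chunk at logical @chunk_offset in the
 * chunk tree.
 */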
1552 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1553                                   struct btrfs_device *device,
1554                                   u64 chunk_offset, u64 start, u64 num_bytes)
1555 {
1556         int ret;
1557         struct btrfs_path *path;
1558         struct btrfs_fs_info *fs_info = device->fs_info;
1559         struct btrfs_root *root = fs_info->dev_root;
1560         struct btrfs_dev_extent *extent;
1561         struct extent_buffer *leaf;
1562         struct btrfs_key key;
1563
1564         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1565         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1566         path = btrfs_alloc_path();
1567         if (!path)
1568                 return -ENOMEM;
1569
1570         key.objectid = device->devid;
1571         key.offset = start;
1572         key.type = BTRFS_DEV_EXTENT_KEY;
1573         ret = btrfs_insert_empty_item(trans, root, path, &key,
1574                                       sizeof(*extent));
1575         if (ret)
1576                 goto out;
1577
1578         leaf = path->nodes[0];
1579         extent = btrfs_item_ptr(leaf, path->slots[0],
1580                                 struct btrfs_dev_extent);
1581         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1582                                         BTRFS_CHUNK_TREE_OBJECTID);
1583         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1584                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1585         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1586
1587         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1588         btrfs_mark_buffer_dirty(leaf);
1589 out:
1590         btrfs_free_path(path);
1591         return ret;
1592 }
1593
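/*
 * Return the logical offset at which the next chunk can be placed: the end
 * (start + len) of the last extent map in the mapping tree, or 0 if the
 * tree is empty.
 */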
1594 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1595 {
1596         struct extent_map_tree *em_tree;
1597         struct extent_map *em;
1598         struct rb_node *n;
1599         u64 ret = 0;
1600
1601         em_tree = &fs_info->mapping_tree.map_tree;
1602         read_lock(&em_tree->lock);
1603         n = rb_last(&em_tree->map);
1604         if (n) {
1605                 em = rb_entry(n, struct extent_map, rb_node);
1606                 ret = em->start + em->len;
1607         }
1608         read_unlock(&em_tree->lock);
1609
1610         return ret;
1611 }
1612
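/*
 * Find the next available device id by locating the dev item with the
 * highest devid in the chunk tree and returning that devid plus one, or 1
 * if no dev items exist.
 */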
1613 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1614                                     u64 *devid_ret)
1615 {
1616         int ret;
1617         struct btrfs_key key;
1618         struct btrfs_key found_key;
1619         struct btrfs_path *path;
1620
1621         path = btrfs_alloc_path();
1622         if (!path)
1623                 return -ENOMEM;
1624
1625         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1626         key.type = BTRFS_DEV_ITEM_KEY;
1627         key.offset = (u64)-1;
1628
1629         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1630         if (ret < 0)
1631                 goto error;
1632
1633         BUG_ON(ret == 0); /* Corruption */
1634
1635         ret = btrfs_previous_item(fs_info->chunk_root, path,
1636                                   BTRFS_DEV_ITEMS_OBJECTID,
1637                                   BTRFS_DEV_ITEM_KEY);
1638         if (ret) {
1639                 *devid_ret = 1;
1640         } else {
1641                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1642                                       path->slots[0]);
1643                 *devid_ret = found_key.offset + 1;
1644         }
1645         ret = 0;
1646 error:
1647         btrfs_free_path(path);
1648         return ret;
1649 }
1650
1651 /*
1652  * The device information is stored in the chunk root.
1653  * The btrfs_device struct should be fully filled in before calling this.
1654  */
1655 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1656                             struct btrfs_fs_info *fs_info,
1657                             struct btrfs_device *device)
1658 {
1659         struct btrfs_root *root = fs_info->chunk_root;
1660         int ret;
1661         struct btrfs_path *path;
1662         struct btrfs_dev_item *dev_item;
1663         struct extent_buffer *leaf;
1664         struct btrfs_key key;
1665         unsigned long ptr;
1666
1667         path = btrfs_alloc_path();
1668         if (!path)
1669                 return -ENOMEM;
1670
1671         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1672         key.type = BTRFS_DEV_ITEM_KEY;
1673         key.offset = device->devid;
1674
1675         ret = btrfs_insert_empty_item(trans, root, path, &key,
1676                                       sizeof(*dev_item));
1677         if (ret)
1678                 goto out;
1679
1680         leaf = path->nodes[0];
1681         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1682
1683         btrfs_set_device_id(leaf, dev_item, device->devid);
1684         btrfs_set_device_generation(leaf, dev_item, 0);
1685         btrfs_set_device_type(leaf, dev_item, device->type);
1686         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1687         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1688         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1689         btrfs_set_device_total_bytes(leaf, dev_item,
1690                                      btrfs_device_get_disk_total_bytes(device));
1691         btrfs_set_device_bytes_used(leaf, dev_item,
1692                                     btrfs_device_get_bytes_used(device));
1693         btrfs_set_device_group(leaf, dev_item, 0);
1694         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1695         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1696         btrfs_set_device_start_offset(leaf, dev_item, 0);
1697
1698         ptr = btrfs_device_uuid(dev_item);
1699         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1700         ptr = btrfs_device_fsid(dev_item);
1701         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1702         btrfs_mark_buffer_dirty(leaf);
1703
1704         ret = 0;
1705 out:
1706         btrfs_free_path(path);
1707         return ret;
1708 }
1709
1710 /*
1711  * Update ctime/mtime for a given device path.
1712  * Mainly used by ctime/mtime based probes such as libblkid.
1713  */
1714 static void update_dev_time(const char *path_name)
1715 {
1716         struct file *filp;
1717
1718         filp = filp_open(path_name, O_RDWR, 0);
1719         if (IS_ERR(filp))
1720                 return;
1721         file_update_time(filp);
1722         filp_close(filp, NULL);
1723 }
1724
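/*
 * Delete the dev item for @device from the chunk tree. This runs in its
 * own transaction, which is committed on success and aborted on failure.
 */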
1725 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1726                              struct btrfs_device *device)
1727 {
1728         struct btrfs_root *root = fs_info->chunk_root;
1729         int ret;
1730         struct btrfs_path *path;
1731         struct btrfs_key key;
1732         struct btrfs_trans_handle *trans;
1733
1734         path = btrfs_alloc_path();
1735         if (!path)
1736                 return -ENOMEM;
1737
1738         trans = btrfs_start_transaction(root, 0);
1739         if (IS_ERR(trans)) {
1740                 btrfs_free_path(path);
1741                 return PTR_ERR(trans);
1742         }
1743         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1744         key.type = BTRFS_DEV_ITEM_KEY;
1745         key.offset = device->devid;
1746
1747         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1748         if (ret) {
1749                 if (ret > 0)
1750                         ret = -ENOENT;
1751                 btrfs_abort_transaction(trans, ret);
1752                 btrfs_end_transaction(trans);
1753                 goto out;
1754         }
1755
1756         ret = btrfs_del_item(trans, root, path);
1757         if (ret) {
1758                 btrfs_abort_transaction(trans, ret);
1759                 btrfs_end_transaction(trans);
1760         }
1761
1762 out:
1763         btrfs_free_path(path);
1764         if (!ret)
1765                 ret = btrfs_commit_transaction(trans);
1766         return ret;
1767 }
1768
1769 /*
1770  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1771  * filesystem. It's up to the caller to adjust that number for e.g. an ongoing
1772  * device replace.
1773  */
1774 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1775                 u64 num_devices)
1776 {
1777         u64 all_avail;
1778         unsigned seq;
1779         int i;
1780
1781         do {
1782                 seq = read_seqbegin(&fs_info->profiles_lock);
1783
1784                 all_avail = fs_info->avail_data_alloc_bits |
1785                             fs_info->avail_system_alloc_bits |
1786                             fs_info->avail_metadata_alloc_bits;
1787         } while (read_seqretry(&fs_info->profiles_lock, seq));
1788
1789         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1790                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1791                         continue;
1792
1793                 if (num_devices < btrfs_raid_array[i].devs_min) {
1794                         int ret = btrfs_raid_array[i].mindev_error;
1795
1796                         if (ret)
1797                                 return ret;
1798                 }
1799         }
1800
1801         return 0;
1802 }
1803
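/*
 * Return the first device in @fs_devs that is not @device, is not missing
 * and has an open bdev, or NULL if there is no such device.
 */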
1804 static struct btrfs_device *btrfs_find_next_active_device(
1805                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1806 {
1807         struct btrfs_device *next_device;
1808
1809         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1810                 if (next_device != device &&
1811                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1812                     && next_device->bdev)
1813                         return next_device;
1814         }
1815
1816         return NULL;
1817 }
1818
1819 /*
1820  * Helper function to check if the given device is part of s_bdev / latest_bdev
1821  * and replace it with the provided or the next active device. In the context
1822  * where this function is called, there should always be another device (or
1823  * this_dev) which is active.
1824  */
1825 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1826                 struct btrfs_device *device, struct btrfs_device *this_dev)
1827 {
1828         struct btrfs_device *next_device;
1829
1830         if (this_dev)
1831                 next_device = this_dev;
1832         else
1833                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1834                                                                 device);
1835         ASSERT(next_device);
1836
1837         if (fs_info->sb->s_bdev &&
1838                         (fs_info->sb->s_bdev == device->bdev))
1839                 fs_info->sb->s_bdev = next_device->bdev;
1840
1841         if (fs_info->fs_devices->latest_bdev == device->bdev)
1842                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1843 }
1844
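/*
 * Remove a device from a mounted filesystem, identified either by @devid or
 * by @device_path; typically reached from the device remove ioctls. The
 * device is shrunk to zero, its dev item is deleted, it is unlinked from
 * the device lists and its superblock copies are scratched so the disk is
 * no longer recognized as part of the filesystem.
 */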
1845 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1846                 u64 devid)
1847 {
1848         struct btrfs_device *device;
1849         struct btrfs_fs_devices *cur_devices;
1850         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1851         u64 num_devices;
1852         int ret = 0;
1853
1854         mutex_lock(&uuid_mutex);
1855
1856         num_devices = fs_devices->num_devices;
1857         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1858         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1859                 WARN_ON(num_devices < 1);
1860                 num_devices--;
1861         }
1862         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1863
1864         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1865         if (ret)
1866                 goto out;
1867
1868         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1869                                            &device);
1870         if (ret)
1871                 goto out;
1872
1873         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1874                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1875                 goto out;
1876         }
1877
1878         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1879             fs_info->fs_devices->rw_devices == 1) {
1880                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1881                 goto out;
1882         }
1883
1884         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1885                 mutex_lock(&fs_info->chunk_mutex);
1886                 list_del_init(&device->dev_alloc_list);
1887                 device->fs_devices->rw_devices--;
1888                 mutex_unlock(&fs_info->chunk_mutex);
1889         }
1890
1891         mutex_unlock(&uuid_mutex);
1892         ret = btrfs_shrink_device(device, 0);
1893         mutex_lock(&uuid_mutex);
1894         if (ret)
1895                 goto error_undo;
1896
1897         /*
1898          * TODO: the superblock still includes this device in its num_devices
1899          * counter although write_all_supers() is not locked out. This
1900          * could leave the filesystem in a state which requires a degraded mount.
1901          */
1902         ret = btrfs_rm_dev_item(fs_info, device);
1903         if (ret)
1904                 goto error_undo;
1905
1906         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1907         btrfs_scrub_cancel_dev(fs_info, device);
1908
1909         /*
1910          * The device list mutex makes sure that we don't change
1911          * the device list while someone else is writing out all
1912          * the device supers. Whoever is writing all supers should
1913          * lock the device list mutex before getting the number of
1914          * devices in the super block (super_copy). Conversely,
1915          * whoever updates the number of devices in the super block
1916          * (super_copy) should hold the device list mutex.
1917          */
1918
1919         /*
1920          * In normal cases cur_devices == fs_devices. But when deleting
1921          * a seed device, cur_devices should point to the seed's own
1922          * fs_devices, listed under fs_devices->seed.
1923          */
1924         cur_devices = device->fs_devices;
1925         mutex_lock(&fs_devices->device_list_mutex);
1926         list_del_rcu(&device->dev_list);
1927
1928         cur_devices->num_devices--;
1929         cur_devices->total_devices--;
1930         /* Update total_devices of the parent fs_devices if it's seed */
1931         if (cur_devices != fs_devices)
1932                 fs_devices->total_devices--;
1933
1934         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1935                 cur_devices->missing_devices--;
1936
1937         btrfs_assign_next_active_device(fs_info, device, NULL);
1938
1939         if (device->bdev) {
1940                 cur_devices->open_devices--;
1941                 /* remove sysfs entry */
1942                 btrfs_sysfs_rm_device_link(fs_devices, device);
1943         }
1944
1945         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1946         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1947         mutex_unlock(&fs_devices->device_list_mutex);
1948
1949         /*
1950          * at this point, the device is zero sized and detached from
1951          * the devices list.  All that's left is to zero out the old
1952          * supers and free the device.
1953          */
1954         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1955                 btrfs_scratch_superblocks(device->bdev, device->name->str);
1956
1957         btrfs_close_bdev(device);
1958         call_rcu(&device->rcu, free_device_rcu);
1959
1960         if (cur_devices->open_devices == 0) {
1961                 while (fs_devices) {
1962                         if (fs_devices->seed == cur_devices) {
1963                                 fs_devices->seed = cur_devices->seed;
1964                                 break;
1965                         }
1966                         fs_devices = fs_devices->seed;
1967                 }
1968                 cur_devices->seed = NULL;
1969                 close_fs_devices(cur_devices);
1970                 free_fs_devices(cur_devices);
1971         }
1972
1973 out:
1974         mutex_unlock(&uuid_mutex);
1975         return ret;
1976
1977 error_undo:
1978         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1979                 mutex_lock(&fs_info->chunk_mutex);
1980                 list_add(&device->dev_alloc_list,
1981                          &fs_devices->alloc_list);
1982                 device->fs_devices->rw_devices++;
1983                 mutex_unlock(&fs_info->chunk_mutex);
1984         }
1985         goto out;
1986 }
1987
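/*
 * Unlink the replace source device from its fs_devices and adjust the
 * num_devices/missing/rw/open counters. The caller must hold
 * device_list_mutex (asserted below).
 */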
1988 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1989                                         struct btrfs_device *srcdev)
1990 {
1991         struct btrfs_fs_devices *fs_devices;
1992
1993         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
1994
1995         /*
1996          * For a filesystem with no seed, srcdev->fs_devices will point to
1997          * the fs_devices of fs_info. However, when the device being replaced
1998          * is a seed device, it will point to the seed's local fs_devices.
1999          * In short, srcdev has the correct fs_devices in both cases.
2000          */
2001         fs_devices = srcdev->fs_devices;
2002
2003         list_del_rcu(&srcdev->dev_list);
2004         list_del(&srcdev->dev_alloc_list);
2005         fs_devices->num_devices--;
2006         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2007                 fs_devices->missing_devices--;
2008
2009         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2010                 fs_devices->rw_devices--;
2011
2012         if (srcdev->bdev)
2013                 fs_devices->open_devices--;
2014 }
2015
2016 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2017                                       struct btrfs_device *srcdev)
2018 {
2019         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2020
2021         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2022                 /* zero out the old super if it is writable */
2023                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2024         }
2025
2026         btrfs_close_bdev(srcdev);
2027         call_rcu(&srcdev->rcu, free_device_rcu);
2028
2029         /* If there are no devices left, delete the fs_devices as well */
2030         if (!fs_devices->num_devices) {
2031                 struct btrfs_fs_devices *tmp_fs_devices;
2032
2033                 /*
2034                  * On a mounted FS, num_devices can't be zero unless it's a
2035                  * seed. When a seed device is being replaced, the replace
2036                  * target is added to the sprout FS, so no devices are left
2037                  * under the seed FS.
2038                  */
2039                 ASSERT(fs_devices->seeding);
2040
2041                 tmp_fs_devices = fs_info->fs_devices;
2042                 while (tmp_fs_devices) {
2043                         if (tmp_fs_devices->seed == fs_devices) {
2044                                 tmp_fs_devices->seed = fs_devices->seed;
2045                                 break;
2046                         }
2047                         tmp_fs_devices = tmp_fs_devices->seed;
2048                 }
2049                 fs_devices->seed = NULL;
2050                 close_fs_devices(fs_devices);
2051                 free_fs_devices(fs_devices);
2052         }
2053 }
2054
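/*
 * Tear down the replace target device: drop its sysfs link, unlink it from
 * the device list, pick a new active device for s_bdev/latest_bdev if
 * needed, scratch its superblocks and close it.
 */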
2055 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2056                                       struct btrfs_device *tgtdev)
2057 {
2058         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2059
2060         WARN_ON(!tgtdev);
2061         mutex_lock(&fs_devices->device_list_mutex);
2062
2063         btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2064
2065         if (tgtdev->bdev)
2066                 fs_devices->open_devices--;
2067
2068         fs_devices->num_devices--;
2069
2070         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2071
2072         list_del_rcu(&tgtdev->dev_list);
2073
2074         mutex_unlock(&fs_devices->device_list_mutex);
2075
2076         /*
2077          * The update_dev_time() within btrfs_scratch_superblocks()
2078          * may lead to a call to btrfs_show_devname() which will try
2079          * to take device_list_mutex. Here the device is already off
2080          * the device list, so we don't have to hold the
2081          * device_list_mutex lock.
2082          */
2083         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2084
2085         btrfs_close_bdev(tgtdev);
2086         call_rcu(&tgtdev->rcu, free_device_rcu);
2087 }
2088
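/*
 * Open the block device at @device_path read-only, read the btrfs super
 * block from it and look up the matching btrfs_device by devid, device
 * uuid and fsid. Returns -ENOENT if no such device is known.
 */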
2089 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2090                                      const char *device_path,
2091                                      struct btrfs_device **device)
2092 {
2093         int ret = 0;
2094         struct btrfs_super_block *disk_super;
2095         u64 devid;
2096         u8 *dev_uuid;
2097         struct block_device *bdev;
2098         struct buffer_head *bh;
2099
2100         *device = NULL;
2101         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2102                                     fs_info->bdev_holder, 0, &bdev, &bh);
2103         if (ret)
2104                 return ret;
2105         disk_super = (struct btrfs_super_block *)bh->b_data;
2106         devid = btrfs_stack_device_id(&disk_super->dev_item);
2107         dev_uuid = disk_super->dev_item.uuid;
2108         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2109         brelse(bh);
2110         if (!*device)
2111                 ret = -ENOENT;
2112         blkdev_put(bdev, FMODE_READ);
2113         return ret;
2114 }
2115
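/*
 * Resolve @device_path to a btrfs_device. The special string "missing"
 * selects the first device that is recorded in the filesystem metadata but
 * has no open bdev (e.g. as passed by "btrfs device delete missing");
 * any other string is treated as a path and resolved on disk.
 */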
2116 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2117                                          const char *device_path,
2118                                          struct btrfs_device **device)
2119 {
2120         *device = NULL;
2121         if (strcmp(device_path, "missing") == 0) {
2122                 struct list_head *devices;
2123                 struct btrfs_device *tmp;
2124
2125                 devices = &fs_info->fs_devices->devices;
2126                 list_for_each_entry(tmp, devices, dev_list) {
2127                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2128                                         &tmp->dev_state) && !tmp->bdev) {
2129                                 *device = tmp;
2130                                 break;
2131                         }
2132                 }
2133
2134                 if (!*device)
2135                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2136
2137                 return 0;
2138         } else {
2139                 return btrfs_find_device_by_path(fs_info, device_path, device);
2140         }
2141 }
2142
2143 /*
2144  * Look up a device given by device id, or by the path if the id is 0.
2145  */
2146 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2147                                  const char *devpath,
2148                                  struct btrfs_device **device)
2149 {
2150         int ret;
2151
2152         if (devid) {
2153                 ret = 0;
2154                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2155                 if (!*device)
2156                         ret = -ENOENT;
2157         } else {
2158                 if (!devpath || !devpath[0])
2159                         return -EINVAL;
2160
2161                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2162                                                            device);
2163         }
2164         return ret;
2165 }
2166
2167 /*
2168  * Does all the dirty work required for changing the file system's UUID.
2169  */
2170 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2171 {
2172         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2173         struct btrfs_fs_devices *old_devices;
2174         struct btrfs_fs_devices *seed_devices;
2175         struct btrfs_super_block *disk_super = fs_info->super_copy;
2176         struct btrfs_device *device;
2177         u64 super_flags;
2178
2179         lockdep_assert_held(&uuid_mutex);
2180         if (!fs_devices->seeding)
2181                 return -EINVAL;
2182
2183         seed_devices = alloc_fs_devices(NULL);
2184         if (IS_ERR(seed_devices))
2185                 return PTR_ERR(seed_devices);
2186
2187         old_devices = clone_fs_devices(fs_devices);
2188         if (IS_ERR(old_devices)) {
2189                 kfree(seed_devices);
2190                 return PTR_ERR(old_devices);
2191         }
2192
2193         list_add(&old_devices->fs_list, &fs_uuids);
2194
2195         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2196         seed_devices->opened = 1;
2197         INIT_LIST_HEAD(&seed_devices->devices);
2198         INIT_LIST_HEAD(&seed_devices->alloc_list);
2199         mutex_init(&seed_devices->device_list_mutex);
2200
2201         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2202         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2203                               synchronize_rcu);
2204         list_for_each_entry(device, &seed_devices->devices, dev_list)
2205                 device->fs_devices = seed_devices;
2206
2207         mutex_lock(&fs_info->chunk_mutex);
2208         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2209         mutex_unlock(&fs_info->chunk_mutex);
2210
2211         fs_devices->seeding = 0;
2212         fs_devices->num_devices = 0;
2213         fs_devices->open_devices = 0;
2214         fs_devices->missing_devices = 0;
2215         fs_devices->rotating = 0;
2216         fs_devices->seed = seed_devices;
2217
2218         generate_random_uuid(fs_devices->fsid);
2219         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2220         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2221         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2222
2223         super_flags = btrfs_super_flags(disk_super) &
2224                       ~BTRFS_SUPER_FLAG_SEEDING;
2225         btrfs_set_super_flags(disk_super, super_flags);
2226
2227         return 0;
2228 }
2229
2230 /*
2231  * Store the expected generation for seed devices in device items.
2232  */
2233 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2234                                struct btrfs_fs_info *fs_info)
2235 {
2236         struct btrfs_root *root = fs_info->chunk_root;
2237         struct btrfs_path *path;
2238         struct extent_buffer *leaf;
2239         struct btrfs_dev_item *dev_item;
2240         struct btrfs_device *device;
2241         struct btrfs_key key;
2242         u8 fs_uuid[BTRFS_FSID_SIZE];
2243         u8 dev_uuid[BTRFS_UUID_SIZE];
2244         u64 devid;
2245         int ret;
2246
2247         path = btrfs_alloc_path();
2248         if (!path)
2249                 return -ENOMEM;
2250
2251         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2252         key.offset = 0;
2253         key.type = BTRFS_DEV_ITEM_KEY;
2254
2255         while (1) {
2256                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2257                 if (ret < 0)
2258                         goto error;
2259
2260                 leaf = path->nodes[0];
2261 next_slot:
2262                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2263                         ret = btrfs_next_leaf(root, path);
2264                         if (ret > 0)
2265                                 break;
2266                         if (ret < 0)
2267                                 goto error;
2268                         leaf = path->nodes[0];
2269                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2270                         btrfs_release_path(path);
2271                         continue;
2272                 }
2273
2274                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2275                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2276                     key.type != BTRFS_DEV_ITEM_KEY)
2277                         break;
2278
2279                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2280                                           struct btrfs_dev_item);
2281                 devid = btrfs_device_id(leaf, dev_item);
2282                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2283                                    BTRFS_UUID_SIZE);
2284                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2285                                    BTRFS_FSID_SIZE);
2286                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2287                 BUG_ON(!device); /* Logic error */
2288
2289                 if (device->fs_devices->seeding) {
2290                         btrfs_set_device_generation(leaf, dev_item,
2291                                                     device->generation);
2292                         btrfs_mark_buffer_dirty(leaf);
2293                 }
2294
2295                 path->slots[0]++;
2296                 goto next_slot;
2297         }
2298         ret = 0;
2299 error:
2300         btrfs_free_path(path);
2301         return ret;
2302 }
2303
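/*
 * Add the block device at @device_path as a new device of a mounted
 * filesystem. If the filesystem is a seed, this also sprouts it: the
 * current devices become the read-only seed, the filesystem gets a new
 * fsid with the new device as its first writable member, and the system
 * chunks are relocated after the transaction commits.
 */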
2304 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2305 {
2306         struct btrfs_root *root = fs_info->dev_root;
2307         struct request_queue *q;
2308         struct btrfs_trans_handle *trans;
2309         struct btrfs_device *device;
2310         struct block_device *bdev;
2311         struct super_block *sb = fs_info->sb;
2312         struct rcu_string *name;
2313         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2314         u64 tmp;
2315         int seeding_dev = 0;
2316         int ret = 0;
2317         bool unlocked = false;
2318
2319         if (sb_rdonly(sb) && !fs_devices->seeding)
2320                 return -EROFS;
2321
2322         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2323                                   fs_info->bdev_holder);
2324         if (IS_ERR(bdev))
2325                 return PTR_ERR(bdev);
2326
2327         if (fs_devices->seeding) {
2328                 seeding_dev = 1;
2329                 down_write(&sb->s_umount);
2330                 mutex_lock(&uuid_mutex);
2331         }
2332
2333         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2334
2335         mutex_lock(&fs_devices->device_list_mutex);
2336         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2337                 if (device->bdev == bdev) {
2338                         ret = -EEXIST;
2339                         mutex_unlock(&fs_devices->device_list_mutex);
2341                         goto error;
2342                 }
2343         }
2344         mutex_unlock(&fs_devices->device_list_mutex);
2345
2346         device = btrfs_alloc_device(fs_info, NULL, NULL);
2347         if (IS_ERR(device)) {
2348                 /* we can safely leave the fs_devices entry around */
2349                 ret = PTR_ERR(device);
2350                 goto error;
2351         }
2352
2353         name = rcu_string_strdup(device_path, GFP_KERNEL);
2354         if (!name) {
2355                 ret = -ENOMEM;
2356                 goto error_free_device;
2357         }
2358         rcu_assign_pointer(device->name, name);
2359
2360         trans = btrfs_start_transaction(root, 0);
2361         if (IS_ERR(trans)) {
2362                 ret = PTR_ERR(trans);
2363                 goto error_free_device;
2364         }
2365
2366         q = bdev_get_queue(bdev);
2367         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2368         device->generation = trans->transid;
2369         device->io_width = fs_info->sectorsize;
2370         device->io_align = fs_info->sectorsize;
2371         device->sector_size = fs_info->sectorsize;
2372         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2373                                          fs_info->sectorsize);
2374         device->disk_total_bytes = device->total_bytes;
2375         device->commit_total_bytes = device->total_bytes;
2376         device->fs_info = fs_info;
2377         device->bdev = bdev;
2378         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2379         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2380         device->mode = FMODE_EXCL;
2381         device->dev_stats_valid = 1;
2382         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2383
2384         if (seeding_dev) {
2385                 sb->s_flags &= ~SB_RDONLY;
2386                 ret = btrfs_prepare_sprout(fs_info);
2387                 if (ret) {
2388                         btrfs_abort_transaction(trans, ret);
2389                         goto error_trans;
2390                 }
2391         }
2392
2393         device->fs_devices = fs_devices;
2394
2395         mutex_lock(&fs_devices->device_list_mutex);
2396         mutex_lock(&fs_info->chunk_mutex);
2397         list_add_rcu(&device->dev_list, &fs_devices->devices);
2398         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2399         fs_devices->num_devices++;
2400         fs_devices->open_devices++;
2401         fs_devices->rw_devices++;
2402         fs_devices->total_devices++;
2403         fs_devices->total_rw_bytes += device->total_bytes;
2404
2405         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2406
2407         if (!blk_queue_nonrot(q))
2408                 fs_devices->rotating = 1;
2409
2410         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2411         btrfs_set_super_total_bytes(fs_info->super_copy,
2412                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2413
2414         tmp = btrfs_super_num_devices(fs_info->super_copy);
2415         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2416
2417         /* add sysfs device entry */
2418         btrfs_sysfs_add_device_link(fs_devices, device);
2419
2420         /*
2421          * We've got more storage, so clear any full flags on the
2422          * space infos.
2423          */
2424         btrfs_clear_space_info_full(fs_info);
2425
2426         mutex_unlock(&fs_info->chunk_mutex);
2427         mutex_unlock(&fs_devices->device_list_mutex);
2428
2429         if (seeding_dev) {
2430                 mutex_lock(&fs_info->chunk_mutex);
2431                 ret = init_first_rw_device(trans, fs_info);
2432                 mutex_unlock(&fs_info->chunk_mutex);
2433                 if (ret) {
2434                         btrfs_abort_transaction(trans, ret);
2435                         goto error_sysfs;
2436                 }
2437         }
2438
2439         ret = btrfs_add_dev_item(trans, fs_info, device);
2440         if (ret) {
2441                 btrfs_abort_transaction(trans, ret);
2442                 goto error_sysfs;
2443         }
2444
2445         if (seeding_dev) {
2446                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2447
2448                 ret = btrfs_finish_sprout(trans, fs_info);
2449                 if (ret) {
2450                         btrfs_abort_transaction(trans, ret);
2451                         goto error_sysfs;
2452                 }
2453
2454                 /* Sprouting would change the fsid of the mounted root,
2455                  * so rename the fsid in sysfs.
2456                  */
2457                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2458                                                 fs_info->fsid);
2459                 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2460                         btrfs_warn(fs_info,
2461                                    "sysfs: failed to create fsid for sprout");
2462         }
2463
2464         ret = btrfs_commit_transaction(trans);
2465
2466         if (seeding_dev) {
2467                 mutex_unlock(&uuid_mutex);
2468                 up_write(&sb->s_umount);
2469                 unlocked = true;
2470
2471                 if (ret) /* transaction commit */
2472                         return ret;
2473
2474                 ret = btrfs_relocate_sys_chunks(fs_info);
2475                 if (ret < 0)
2476                         btrfs_handle_fs_error(fs_info, ret,
2477                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2478                 trans = btrfs_attach_transaction(root);
2479                 if (IS_ERR(trans)) {
2480                         if (PTR_ERR(trans) == -ENOENT)
2481                                 return 0;
2482                         ret = PTR_ERR(trans);
2483                         trans = NULL;
2484                         goto error_sysfs;
2485                 }
2486                 ret = btrfs_commit_transaction(trans);
2487         }
2488
2489         /* Update ctime/mtime for libblkid */
2490         update_dev_time(device_path);
2491         return ret;
2492
2493 error_sysfs:
2494         btrfs_sysfs_rm_device_link(fs_devices, device);
2495 error_trans:
2496         if (seeding_dev)
2497                 sb->s_flags |= SB_RDONLY;
2498         if (trans)
2499                 btrfs_end_transaction(trans);
2500 error_free_device:
2501         btrfs_free_device(device);
2502 error:
2503         blkdev_put(bdev, FMODE_EXCL);
2504         if (seeding_dev && !unlocked) {
2505                 mutex_unlock(&uuid_mutex);
2506                 up_write(&sb->s_umount);
2507         }
2508         return ret;
2509 }
2510
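/*
 * Write the current in-memory state of @device (sizes, io geometry, type)
 * back into its dev item in the chunk tree. Returns -ENOENT if the dev
 * item cannot be found.
 */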
2511 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2512                                         struct btrfs_device *device)
2513 {
2514         int ret;
2515         struct btrfs_path *path;
2516         struct btrfs_root *root = device->fs_info->chunk_root;
2517         struct btrfs_dev_item *dev_item;
2518         struct extent_buffer *leaf;
2519         struct btrfs_key key;
2520
2521         path = btrfs_alloc_path();
2522         if (!path)
2523                 return -ENOMEM;
2524
2525         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2526         key.type = BTRFS_DEV_ITEM_KEY;
2527         key.offset = device->devid;
2528
2529         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2530         if (ret < 0)
2531                 goto out;
2532
2533         if (ret > 0) {
2534                 ret = -ENOENT;
2535                 goto out;
2536         }
2537
2538         leaf = path->nodes[0];
2539         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2540
2541         btrfs_set_device_id(leaf, dev_item, device->devid);
2542         btrfs_set_device_type(leaf, dev_item, device->type);
2543         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2544         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2545         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2546         btrfs_set_device_total_bytes(leaf, dev_item,
2547                                      btrfs_device_get_disk_total_bytes(device));
2548         btrfs_set_device_bytes_used(leaf, dev_item,
2549                                     btrfs_device_get_bytes_used(device));
2550         btrfs_mark_buffer_dirty(leaf);
2551
2552 out:
2553         btrfs_free_path(path);
2554         return ret;
2555 }
2556
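/*
 * Grow @device to @new_size (rounded down to the sector size) inside the
 * given transaction, updating the superblock total and the device's dev
 * item. Shrinking and growing a replace target are rejected with -EINVAL.
 *
 * A sketch of a typical call site (illustrative only):
 *
 *	trans = btrfs_start_transaction(device->fs_info->dev_root, 0);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = btrfs_grow_device(trans, device, new_size);
 *	btrfs_commit_transaction(trans);
 */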
2557 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2558                       struct btrfs_device *device, u64 new_size)
2559 {
2560         struct btrfs_fs_info *fs_info = device->fs_info;
2561         struct btrfs_super_block *super_copy = fs_info->super_copy;
2562         struct btrfs_fs_devices *fs_devices;
2563         u64 old_total;
2564         u64 diff;
2565
2566         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2567                 return -EACCES;
2568
2569         new_size = round_down(new_size, fs_info->sectorsize);
2570
2571         mutex_lock(&fs_info->chunk_mutex);
2572         old_total = btrfs_super_total_bytes(super_copy);
2573         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2574
2575         if (new_size <= device->total_bytes ||
2576             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2577                 mutex_unlock(&fs_info->chunk_mutex);
2578                 return -EINVAL;
2579         }
2580
2581         fs_devices = fs_info->fs_devices;
2582
2583         btrfs_set_super_total_bytes(super_copy,
2584                         round_down(old_total + diff, fs_info->sectorsize));
2585         device->fs_devices->total_rw_bytes += diff;
2586
2587         btrfs_device_set_total_bytes(device, new_size);
2588         btrfs_device_set_disk_total_bytes(device, new_size);
2589         btrfs_clear_space_info_full(device->fs_info);
2590         if (list_empty(&device->resized_list))
2591                 list_add_tail(&device->resized_list,
2592                               &fs_devices->resized_devices);
2593         mutex_unlock(&fs_info->chunk_mutex);
2594
2595         return btrfs_update_device(trans, device);
2596 }
2597
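/*
 * Delete the chunk item at @chunk_offset from the chunk tree. A missing
 * item is treated as a logic error or corruption.
 */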
2598 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2599                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2600 {
2601         struct btrfs_root *root = fs_info->chunk_root;
2602         int ret;
2603         struct btrfs_path *path;
2604         struct btrfs_key key;
2605
2606         path = btrfs_alloc_path();
2607         if (!path)
2608                 return -ENOMEM;
2609
2610         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2611         key.offset = chunk_offset;
2612         key.type = BTRFS_CHUNK_ITEM_KEY;
2613
2614         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2615         if (ret < 0)
2616                 goto out;
2617         else if (ret > 0) { /* Logic error or corruption */
2618                 btrfs_handle_fs_error(fs_info, -ENOENT,
2619                                       "Failed lookup while freeing chunk.");
2620                 ret = -ENOENT;
2621                 goto out;
2622         }
2623
2624         ret = btrfs_del_item(trans, root, path);
2625         if (ret < 0)
2626                 btrfs_handle_fs_error(fs_info, ret,
2627                                       "Failed to delete chunk item.");
2628 out:
2629         btrfs_free_path(path);
2630         return ret;
2631 }
2632
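/*
 * Remove the entry for the chunk at @chunk_offset from the superblock's
 * sys_chunk_array. The array is a packed sequence of pairs:
 *
 *   | btrfs_disk_key | btrfs_chunk (+ num_stripes stripes) | ...
 *
 * so each iteration decodes a key, computes the item length from the
 * stripe count, and either memmove()s the tail over a matching entry or
 * advances past it.
 */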
2633 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2634 {
2635         struct btrfs_super_block *super_copy = fs_info->super_copy;
2636         struct btrfs_disk_key *disk_key;
2637         struct btrfs_chunk *chunk;
2638         u8 *ptr;
2639         int ret = 0;
2640         u32 num_stripes;
2641         u32 array_size;
2642         u32 len = 0;
2643         u32 cur;
2644         struct btrfs_key key;
2645
2646         mutex_lock(&fs_info->chunk_mutex);
2647         array_size = btrfs_super_sys_array_size(super_copy);
2648
2649         ptr = super_copy->sys_chunk_array;
2650         cur = 0;
2651
2652         while (cur < array_size) {
2653                 disk_key = (struct btrfs_disk_key *)ptr;
2654                 btrfs_disk_key_to_cpu(&key, disk_key);
2655
2656                 len = sizeof(*disk_key);
2657
2658                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2659                         chunk = (struct btrfs_chunk *)(ptr + len);
2660                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2661                         len += btrfs_chunk_item_size(num_stripes);
2662                 } else {
2663                         ret = -EIO;
2664                         break;
2665                 }
2666                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2667                     key.offset == chunk_offset) {
2668                         memmove(ptr, ptr + len, array_size - (cur + len));
2669                         array_size -= len;
2670                         btrfs_set_super_sys_array_size(super_copy, array_size);
2671                 } else {
2672                         ptr += len;
2673                         cur += len;
2674                 }
2675         }
2676         mutex_unlock(&fs_info->chunk_mutex);
2677         return ret;
2678 }
2679
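/*
 * Look up the extent map (chunk mapping) that covers @logical/@length and
 * sanity check that the mapping actually contains @logical. Returns an
 * ERR_PTR on failure.
 */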
2680 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2681                                         u64 logical, u64 length)
2682 {
2683         struct extent_map_tree *em_tree;
2684         struct extent_map *em;
2685
2686         em_tree = &fs_info->mapping_tree.map_tree;
2687         read_lock(&em_tree->lock);
2688         em = lookup_extent_mapping(em_tree, logical, length);
2689         read_unlock(&em_tree->lock);
2690
2691         if (!em) {
2692                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2693                            logical, length);
2694                 return ERR_PTR(-EINVAL);
2695         }
2696
2697         if (em->start > logical || em->start + em->len < logical) {
2698                 btrfs_crit(fs_info,
2699                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2700                            logical, length, em->start, em->start + em->len);
2701                 free_extent_map(em);
2702                 return ERR_PTR(-EINVAL);
2703         }
2704
2705         /* callers are responsible for dropping em's ref. */
2706         return em;
2707 }
2708
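/*
 * Remove the fully relocated chunk at @chunk_offset: free its dev extents
 * on every stripe, update the affected devices, delete the chunk item (and
 * the sys_chunk_array entry for SYSTEM chunks) and remove the block group.
 */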
2709 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2710                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2711 {
2712         struct extent_map *em;
2713         struct map_lookup *map;
2714         u64 dev_extent_len = 0;
2715         int i, ret = 0;
2716         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2717
2718         em = get_chunk_map(fs_info, chunk_offset, 1);
2719         if (IS_ERR(em)) {
2720                 /*
2721                  * This is a logic error, but we don't want to just rely on the
2722                  * user having built with ASSERT enabled, so if ASSERT doesn't
2723                  * do anything we still error out.
2724                  */
2725                 ASSERT(0);
2726                 return PTR_ERR(em);
2727         }
2728         map = em->map_lookup;
2729         mutex_lock(&fs_info->chunk_mutex);
2730         check_system_chunk(trans, map->type);
2731         mutex_unlock(&fs_info->chunk_mutex);
2732
2733         /*
2734          * Take the device list mutex to prevent races with the final phase of
2735          * a device replace operation that replaces the device object associated
2736          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2737          */
2738         mutex_lock(&fs_devices->device_list_mutex);
2739         for (i = 0; i < map->num_stripes; i++) {
2740                 struct btrfs_device *device = map->stripes[i].dev;
2741                 ret = btrfs_free_dev_extent(trans, device,
2742                                             map->stripes[i].physical,
2743                                             &dev_extent_len);
2744                 if (ret) {
2745                         mutex_unlock(&fs_devices->device_list_mutex);
2746                         btrfs_abort_transaction(trans, ret);
2747                         goto out;
2748                 }
2749
2750                 if (device->bytes_used > 0) {
2751                         mutex_lock(&fs_info->chunk_mutex);
2752                         btrfs_device_set_bytes_used(device,
2753                                         device->bytes_used - dev_extent_len);
2754                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2755                         btrfs_clear_space_info_full(fs_info);
2756                         mutex_unlock(&fs_info->chunk_mutex);
2757                 }
2758
2759                 if (map->stripes[i].dev) {
2760                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2761                         if (ret) {
2762                                 mutex_unlock(&fs_devices->device_list_mutex);
2763                                 btrfs_abort_transaction(trans, ret);
2764                                 goto out;
2765                         }
2766                 }
2767         }
2768         mutex_unlock(&fs_devices->device_list_mutex);
2769
2770         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2771         if (ret) {
2772                 btrfs_abort_transaction(trans, ret);
2773                 goto out;
2774         }
2775
2776         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2777
2778         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2779                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2780                 if (ret) {
2781                         btrfs_abort_transaction(trans, ret);
2782                         goto out;
2783                 }
2784         }
2785
2786         ret = btrfs_remove_block_group(trans, chunk_offset, em);
2787         if (ret) {
2788                 btrfs_abort_transaction(trans, ret);
2789                 goto out;
2790         }
2791
2792 out:
2793         /* once for us */
2794         free_extent_map(em);
2795         return ret;
2796 }
2797
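/*
 * Relocate all data out of the chunk at @chunk_offset and then delete it.
 * The caller must hold delete_unused_bgs_mutex to keep the cleaner thread
 * from removing the block group underneath us (asserted below).
 */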
2798 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2799 {
2800         struct btrfs_root *root = fs_info->chunk_root;
2801         struct btrfs_trans_handle *trans;
2802         int ret;
2803
2804         /*
2805          * Prevent races with automatic removal of unused block groups.
2806          * After we relocate and before we remove the chunk with offset
2807          * chunk_offset, automatic removal of the block group can kick in,
2808          * resulting in a failure when calling btrfs_remove_chunk() below.
2809          *
2810          * Make sure to acquire this mutex before doing a tree search (dev
2811          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2812          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2813          * we release the path used to search the chunk/dev tree and before
2814          * the current task acquires this mutex and calls us.
2815          */
2816         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2817
2818         ret = btrfs_can_relocate(fs_info, chunk_offset);
2819         if (ret)
2820                 return -ENOSPC;
2821
2822         /* step one, relocate all the extents inside this chunk */
2823         btrfs_scrub_pause(fs_info);
2824         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2825         btrfs_scrub_continue(fs_info);
2826         if (ret)
2827                 return ret;
2828
2829         /*
2830          * We add the kobjects here (and after forcing data chunk creation)
2831          * since relocation is the only place we'll create chunks of a new
2832          * type at runtime.  The only place where we'll remove the last
2833          * chunk of a type is the call immediately below this one.  Even
2834          * so, we're protected against races with the cleaner thread since
2835          * we're covered by the delete_unused_bgs_mutex.
2836          */
2837         btrfs_add_raid_kobjects(fs_info);
2838
2839         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2840                                                      chunk_offset);
2841         if (IS_ERR(trans)) {
2842                 ret = PTR_ERR(trans);
2843                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2844                 return ret;
2845         }
2846
2847         /*
2848          * step two, delete the device extents and the
2849          * chunk tree entries
2850          */
2851         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2852         btrfs_end_transaction(trans);
2853         return ret;
2854 }
2855
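/*
 * Relocate every SYSTEM chunk, walking the chunk tree backwards from the
 * highest offset.  Chunks that fail with -ENOSPC are retried in a second
 * pass, since relocating their neighbours may have freed space; if the
 * retry also fails we return -ENOSPC.
 */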
2856 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2857 {
2858         struct btrfs_root *chunk_root = fs_info->chunk_root;
2859         struct btrfs_path *path;
2860         struct extent_buffer *leaf;
2861         struct btrfs_chunk *chunk;
2862         struct btrfs_key key;
2863         struct btrfs_key found_key;
2864         u64 chunk_type;
2865         bool retried = false;
2866         int failed = 0;
2867         int ret;
2868
2869         path = btrfs_alloc_path();
2870         if (!path)
2871                 return -ENOMEM;
2872
2873 again:
2874         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2875         key.offset = (u64)-1;
2876         key.type = BTRFS_CHUNK_ITEM_KEY;
2877
2878         while (1) {
2879                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2880                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2881                 if (ret < 0) {
2882                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2883                         goto error;
2884                 }
2885                 BUG_ON(ret == 0); /* Corruption */
2886
2887                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2888                                           key.type);
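                /*
                 * Any nonzero return means we leave the loop (error or no
                 * more items), so drop the mutex here; when a chunk was
                 * found (ret == 0) it stays held across
                 * btrfs_relocate_chunk() and is dropped further below.
                 */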
2889                 if (ret)
2890                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2891                 if (ret < 0)
2892                         goto error;
2893                 if (ret > 0)
2894                         break;
2895
2896                 leaf = path->nodes[0];
2897                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2898
2899                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2900                                        struct btrfs_chunk);
2901                 chunk_type = btrfs_chunk_type(leaf, chunk);
2902                 btrfs_release_path(path);
2903
2904                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2905                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
2906                         if (ret == -ENOSPC)
2907                                 failed++;
2908                         else
2909                                 BUG_ON(ret);
2910                 }
2911                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2912
2913                 if (found_key.offset == 0)
2914                         break;
2915                 key.offset = found_key.offset - 1;
2916         }
2917         ret = 0;
2918         if (failed && !retried) {
2919                 failed = 0;
2920                 retried = true;
2921                 goto again;
2922         } else if (WARN_ON(failed && retried)) {
2923                 ret = -ENOSPC;
2924         }
2925 error:
2926         btrfs_free_path(path);
2927         return ret;
2928 }
2929
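/*
 * Called before relocating a data chunk: if no data bytes are in use at
 * all, relocation would simply drop the (empty) chunk, so force a fresh
 * data chunk first to keep the configured data profile from being lost
 * with the last data block group.
 */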
2930 /*
2931  * return 1 : a data chunk was allocated successfully,
2932  * return <0: error while allocating a data chunk,
2933  * return 0 : no need to allocate a data chunk.
2934  */
2935 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
2936                                       u64 chunk_offset)
2937 {
2938         struct btrfs_block_group_cache *cache;
2939         u64 bytes_used;
2940         u64 chunk_type;
2941
2942         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2943         ASSERT(cache);
2944         chunk_type = cache->flags;
2945         btrfs_put_block_group(cache);
2946
2947         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
2948                 spin_lock(&fs_info->data_sinfo->lock);
2949                 bytes_used = fs_info->data_sinfo->bytes_used;
2950                 spin_unlock(&fs_info->data_sinfo->lock);
2951
2952                 if (!bytes_used) {
2953                         struct btrfs_trans_handle *trans;
2954                         int ret;
2955
2956                         trans = btrfs_join_transaction(fs_info->tree_root);
2957                         if (IS_ERR(trans))
2958                                 return PTR_ERR(trans);
2959
2960                         ret = btrfs_force_chunk_alloc(trans,
2961                                                       BTRFS_BLOCK_GROUP_DATA);
2962                         btrfs_end_transaction(trans);
2963                         if (ret < 0)
2964                                 return ret;
2965
2966                         btrfs_add_raid_kobjects(fs_info);
2967
2968                         return 1;
2969                 }
2970         }
2971         return 0;
2972 }
2973
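/*
 * Persist the balance args so that an interrupted balance can be resumed
 * on the next mount: the state is stored as a balance item with key
 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) in the tree root.
 */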
2974 static int insert_balance_item(struct btrfs_fs_info *fs_info,
2975                                struct btrfs_balance_control *bctl)
2976 {
2977         struct btrfs_root *root = fs_info->tree_root;
2978         struct btrfs_trans_handle *trans;
2979         struct btrfs_balance_item *item;
2980         struct btrfs_disk_balance_args disk_bargs;
2981         struct btrfs_path *path;
2982         struct extent_buffer *leaf;
2983         struct btrfs_key key;
2984         int ret, err;
2985
2986         path = btrfs_alloc_path();
2987         if (!path)
2988                 return -ENOMEM;
2989
2990         trans = btrfs_start_transaction(root, 0);
2991         if (IS_ERR(trans)) {
2992                 btrfs_free_path(path);
2993                 return PTR_ERR(trans);
2994         }
2995
2996         key.objectid = BTRFS_BALANCE_OBJECTID;
2997         key.type = BTRFS_TEMPORARY_ITEM_KEY;
2998         key.offset = 0;
2999
3000         ret = btrfs_insert_empty_item(trans, root, path, &key,
3001                                       sizeof(*item));
3002         if (ret)
3003                 goto out;
3004
3005         leaf = path->nodes[0];
3006         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3007
3008         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3009
3010         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3011         btrfs_set_balance_data(leaf, item, &disk_bargs);
3012         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3013         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3014         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3015         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3016
3017         btrfs_set_balance_flags(leaf, item, bctl->flags);
3018
3019         btrfs_mark_buffer_dirty(leaf);
3020 out:
3021         btrfs_free_path(path);
3022         err = btrfs_commit_transaction(trans);
3023         if (err && !ret)
3024                 ret = err;
3025         return ret;
3026 }
3027
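/*
 * Delete the persisted balance item from the tree root.  Returns -ENOENT
 * if no balance item exists.
 */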
3028 static int del_balance_item(struct btrfs_fs_info *fs_info)
3029 {
3030         struct btrfs_root *root = fs_info->tree_root;
3031         struct btrfs_trans_handle *trans;
3032         struct btrfs_path *path;
3033         struct btrfs_key key;
3034         int ret, err;
3035
3036         path = btrfs_alloc_path();
3037         if (!path)
3038                 return -ENOMEM;
3039
3040         trans = btrfs_start_transaction(root, 0);
3041         if (IS_ERR(trans)) {
3042                 btrfs_free_path(path);
3043                 return PTR_ERR(trans);
3044         }
3045
3046         key.objectid = BTRFS_BALANCE_OBJECTID;
3047         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3048         key.offset = 0;
3049
3050         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3051         if (ret < 0)
3052                 goto out;
3053         if (ret > 0) {
3054                 ret = -ENOENT;
3055                 goto out;
3056         }
3057
3058         ret = btrfs_del_item(trans, root, path);
3059 out:
3060         btrfs_free_path(path);
3061         err = btrfs_commit_transaction(trans);
3062         if (err && !ret)
3063                 ret = err;
3064         return ret;
3065 }
3066
3067 /*
3068  * This is a heuristic used to reduce the number of chunks balanced on
3069  * resume after balance was interrupted.
3070  */
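/*
 * Illustratively, in btrfs-progs terms a balance started as
 * "-dconvert=raid1" that was interrupted resumes roughly as
 * "-dconvert=raid1,soft,usage=90": soft mode skips chunks that already
 * have the target profile, and the usage filter skips chunks a previous
 * pass already filled reasonably full.
 */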
3071 static void update_balance_args(struct btrfs_balance_control *bctl)
3072 {
3073         /*
3074          * Turn on soft mode for chunk types that were being converted.
3075          */
3076         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3077                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3078         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3079                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3080         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3081                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3082
3083         /*
3084          * Turn on usage filter if it is not already used.  The idea is
3085          * that chunks that we have already balanced should be
3086          * reasonably full.  Don't do it for chunks that are being
3087          * converted - that will keep us from relocating unconverted
3088          * (albeit full) chunks.
3089          */
3090         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3091             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3092             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3093                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3094                 bctl->data.usage = 90;
3095         }
3096         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3097             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3098             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3099                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3100                 bctl->sys.usage = 90;
3101         }
3102         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3103             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3104             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3105                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3106                 bctl->meta.usage = 90;
3107         }
3108 }
3109
3110 /*
3111  * Clear the balance status in fs_info and delete the balance item from disk.
3112  */
3113 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3114 {
3115         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3116         int ret;
3117
3118         BUG_ON(!fs_info->balance_ctl);
3119
3120         spin_lock(&fs_info->balance_lock);
3121         fs_info->balance_ctl = NULL;
3122         spin_unlock(&fs_info->balance_lock);
3123
3124         kfree(bctl);
3125         ret = del_balance_item(fs_info);
3126         if (ret)
3127                 btrfs_handle_fs_error(fs_info, ret, NULL);
3128 }
3129
3130 /*
3131  * Balance filters.  Return 1 if chunk should be filtered out
3132  * (should not be balanced).
3133  */
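/*
 * The profiles filter works on extended profile bits: chunk_to_extended()
 * maps the "single" profile, which has no block group flag of its own, to
 * BTRFS_AVAIL_ALLOC_BIT_SINGLE so it can be matched against
 * bargs->profiles like the real RAID profiles.
 */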
3134 static int chunk_profiles_filter(u64 chunk_type,
3135                                  struct btrfs_balance_args *bargs)
3136 {
3137         chunk_type = chunk_to_extended(chunk_type) &
3138                                 BTRFS_EXTENDED_PROFILE_MASK;
3139
3140         if (bargs->profiles & chunk_type)
3141                 return 0;
3142
3143         return 1;
3144 }
3145
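/*
 * A chunk passes the usage range filter when
 * user_thresh_min <= used < user_thresh_max, where the thresholds are
 * usage_min and usage_max percent of the chunk length; usage_max == 0
 * degenerates to matching only completely empty chunks.
 */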
3146 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3147                               struct btrfs_balance_args *bargs)
3148 {
3149         struct btrfs_block_group_cache *cache;
3150         u64 chunk_used;
3151         u64 user_thresh_min;
3152         u64 user_thresh_max;
3153         int ret = 1;
3154
3155         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3156         chunk_used = btrfs_block_group_used(&cache->item);
3157
3158         if (bargs->usage_min == 0)
3159                 user_thresh_min = 0;
3160         else
3161                 user_thresh_min = div_factor_fine(cache->key.offset,
3162                                         bargs->usage_min);
3163
3164         if (bargs->usage_max == 0)
3165                 user_thresh_max = 1;
3166         else if (bargs->usage_max > 100)
3167                 user_thresh_max = cache->key.offset;
3168         else
3169                 user_thresh_max = div_factor_fine(cache->key.offset,
3170                                         bargs->usage_max);
3171
3172         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3173                 ret = 0;
3174
3175         btrfs_put_block_group(cache);
3176         return ret;
3177 }
3178
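/*
 * Single-threshold variant of the usage filter: a chunk passes when
 * used < usage percent of its length.  ->usage shares storage with
 * ->usage_min/->usage_max (a union in struct btrfs_balance_args), which
 * is why the empty-chunks special case tests usage_min here.
 */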
3179 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3180                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3181 {
3182         struct btrfs_block_group_cache *cache;
3183         u64 chunk_used, user_thresh;
3184         int ret = 1;
3185
3186         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3187         chunk_used = btrfs_block_group_used(&cache->item);
3188
3189         if (bargs->usage_min == 0)
3190                 user_thresh = 1;
3191         else if (bargs->usage > 100)
3192                 user_thresh = cache->key.offset;
3193         else
3194                 user_thresh = div_factor_fine(cache->key.offset,
3195                                               bargs->usage);
3196
3197         if (chunk_used < user_thresh)
3198                 ret = 0;
3199
3200         btrfs_put_block_group(cache);
3201         return ret;
3202 }
3203
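/* A chunk passes the devid filter if any of its stripes lives on bargs->devid. */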
3204 static int chunk_devid_filter(struct extent_buffer *leaf,
3205                               struct btrfs_chunk *chunk,
3206                               struct btrfs_balance_args *bargs)
3207 {
3208         struct btrfs_stripe *stripe;
3209         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3210         int i;
3211
3212         for (i = 0; i < num_stripes; i++) {
3213                 stripe = btrfs_stripe_nr(chunk, i);
3214                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3215                         return 0;
3216         }
3217
3218         return 1;
3219 }
3220
3221 /* [pstart, pend) — the device byte range checked below is half-open */
3222 static int chunk_drange_filter(struct extent_buffer *leaf,
3223                                struct btrfs_chunk *chunk,
3224                                struct btrfs_balance_args *bargs)
3225 {
3226         struct btrfs_stripe *stripe;
3227         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3228         u64 stripe_offset;
3229         u64 stripe_length;
3230         int factor;
3231         int i;
3232
3233         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3234                 return 0;
3235
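        /*
         * factor is the number of stripes holding distinct data: mirrored
         * profiles (DUP/RAID1/RAID10) store everything twice, RAID5/6
         * spend one or two stripes on parity.  The per-stripe length used
         * by the range check below is the chunk length divided by factor.
         */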
3236         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3237              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3238                 factor = num_stripes / 2;
3239         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3240                 factor = num_stripes - 1;
3241         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3242                 factor = num_stripes - 2;
3243         } else {
3244                 factor = num_stripes;
3245         }
3246
3247         for (i = 0; i < num_stripes; i++) {
3248                 stripe = btrfs_stripe_nr(chunk, i);
3249                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)