btrfs: remove wrong use of volume_mutex from btrfs_dev_replace_start
fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
        },
};
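
/*
 * Example (for illustration): the attributes of a profile are looked up by
 * the BTRFS_RAID_* index, e.g.:
 *
 *	const struct btrfs_raid_attr *attr = &btrfs_raid_array[BTRFS_RAID_RAID1];
 *
 * where attr->ncopies == 2 and attr->tolerated_failures == 1, i.e. RAID1
 * keeps two copies of the data and tolerates the loss of one device.
 */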

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
        [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
        [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if the minimum number of
 * devices condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        [BTRFS_RAID_DUP]    = 0,
        [BTRFS_RAID_RAID0]  = 0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/delete/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
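 *
 * A task that needs several of these locks must acquire them in the order
 * shown above, e.g. volume_mutex before device_list_mutex, and
 * device_list_mutex before chunk_mutex.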
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

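/*
 * Free the fs_devices structure together with all devices still linked on
 * its list. Must only be called once the devices are closed (see the
 * WARN_ON below).
 */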
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

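/*
 * Send a uevent for the given block device's kobject and warn if the
 * delivery fails.
 */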
static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

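/*
 * Put a chain of bios (from @head to @tail) back at the front of the pending
 * list so they are submitted before anything queued later.
 */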
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{

        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

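/* Work item callback: submit all scheduled bios for the owning device. */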
static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:       Optional. When provided, it will release all unmounted devices
 *              matching this path only.
 *  skip_dev:   Optional. Will skip this device when searching for the stale
 *              devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_dev)
{
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;

        list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

                if (fs_devs->opened)
                        continue;

                list_for_each_entry_safe(dev, tmp_dev,
                                         &fs_devs->devices, dev_list) {
                        int not_found = 0;

                        if (skip_dev && skip_dev == dev)
                                continue;
                        if (path && !dev->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(dev->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                btrfs_free_device(dev);
                        }
                }
        }
}

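/*
 * Open the block device backing @device, verify that its superblock matches
 * the expected devid/uuid and update the device and fs_devices state
 * accordingly.
 */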
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return ERR_PTR(-EBUSY);

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                btrfs_free_stale_devices(path, device);

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When FS is already mounted.
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with
                 *         different name. or
                 *      b. The missing-disk-which-was-replaced, has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transaction when it was away and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if generations are equal.
                         */
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero the
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        return device;
}

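/*
 * Create a new fs_devices structure that duplicates @orig, allocating a
 * fresh btrfs_device (and a copy of its name) for each device on the
 * original list.
 */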
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold the volume lock, so it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the devices which do not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

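/* RCU callback: free the device once the grace period has elapsed. */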
static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

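/*
 * Flush and invalidate a writeable device's block device (if any) before
 * releasing it.
 */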
static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

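/*
 * Replace @device on the fs_devices list with a freshly allocated copy that
 * has no open bdev, adjusting the open/rw/missing counters; the original
 * device is then closed and freed by the caller.
 */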
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;
}

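/*
 * Drop one open reference on @fs_devices; when the last reference goes away,
 * detach every device from the list and close its block device.
 */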
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;

        INIT_LIST_HEAD(&pending_put);

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_prepare_close_one_device(device);
                list_add(&device->dev_list, &pending_put);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * btrfs_show_devname() uses the device_list_mutex, and a call to
         * blkdev_put() can sometimes lead the vfs back into this function.
         * So, for now, do the put outside of the device_list_mutex.
         */
        while (!list_empty(&pending_put)) {
                device = list_first_entry(&pending_put,
                                struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_close_bdev(device);
                call_rcu(&device->rcu, free_device_rcu);
        }

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

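/*
 * Open every device on the list that can be opened, remember the one with
 * the highest generation as latest_bdev, and fail only if none could be
 * opened.
 */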
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

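/* list_sort() comparator: order devices by ascending devid. */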
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

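/*
 * Open a set of devices, taking an extra reference if they are already open;
 * otherwise sort them by devid and open each one.
 */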
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

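/*
 * Read the superblock at @bytenr through the pagecache and sanity-check its
 * bytenr and magic; on success the caller must release the returned page
 * with btrfs_release_disk_super().
 */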
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the
 * mount path and we are not allowed to call set_blocksize during the scan.
 * The superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
        int ret = 0;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->fs_info->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

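/*
 * Check whether [*start, *start + len) on @device overlaps any pending or
 * pinned chunk; if it does, advance *start past the conflicting stripe and
 * return 1 so the caller retries with the new offset.
 */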
1350 static int contains_pending_extent(struct btrfs_transaction *transaction,
1351                                    struct btrfs_device *device,
1352                                    u64 *start, u64 len)
1353 {
1354         struct btrfs_fs_info *fs_info = device->fs_info;
1355         struct extent_map *em;
1356         struct list_head *search_list = &fs_info->pinned_chunks;
1357         int ret = 0;
1358         u64 physical_start = *start;
1359
1360         if (transaction)
1361                 search_list = &transaction->pending_chunks;
1362 again:
1363         list_for_each_entry(em, search_list, list) {
1364                 struct map_lookup *map;
1365                 int i;
1366
1367                 map = em->map_lookup;
1368                 for (i = 0; i < map->num_stripes; i++) {
1369                         u64 end;
1370
1371                         if (map->stripes[i].dev != device)
1372                                 continue;
1373                         if (map->stripes[i].physical >= physical_start + len ||
1374                             map->stripes[i].physical + em->orig_block_len <=
1375                             physical_start)
1376                                 continue;
1377                         /*
1378                          * Make sure that while processing the pinned list we do
1379                          * not override our *start with a lower value, because
1380                          * we can have pinned chunks that fall within this
1381                          * device hole and that have lower physical addresses
1382                          * than the pending chunks we processed before. If we
1383                          * do not take this special care we can end up getting
1384                          * 2 pending chunks that start at the same physical
1385                          * device offsets because the end offset of a pinned
1386                          * chunk can be equal to the start offset of some
1387                          * pending chunk.
1388                          */
1389                         end = map->stripes[i].physical + em->orig_block_len;
1390                         if (end > *start) {
1391                                 *start = end;
1392                                 ret = 1;
1393                         }
1394                 }
1395         }
1396         if (search_list != &fs_info->pinned_chunks) {
1397                 search_list = &fs_info->pinned_chunks;
1398                 goto again;
1399         }
1400
1401         return ret;
1402 }
1403
1404
1405 /*
1406  * find_free_dev_extent_start - find free space in the specified device
1407  * @device:       the device which we search the free space in
1408  * @num_bytes:    the size of the free space that we need
1409  * @search_start: the position from which to begin the search
1410  * @start:        store the start of the free space.
1411  * @len:          the size of the free space. that we find, or the size
1412  *                of the max free space if we don't find suitable free space
1413  *
1414  * this uses a pretty simple search, the expectation is that it is
1415  * called very infrequently and that a given device has a small number
1416  * of extents
1417  *
1418  * @start is used to store the start of the free space if we find. But if we
1419  * don't find suitable free space, it will be used to store the start position
1420  * of the max free space.
1421  *
1422  * @len is used to store the size of the free space that we find.
1423  * But if we don't find suitable free space, it is used to store the size of
1424  * the max free space.
1425  */
1426 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1427                                struct btrfs_device *device, u64 num_bytes,
1428                                u64 search_start, u64 *start, u64 *len)
1429 {
1430         struct btrfs_fs_info *fs_info = device->fs_info;
1431         struct btrfs_root *root = fs_info->dev_root;
1432         struct btrfs_key key;
1433         struct btrfs_dev_extent *dev_extent;
1434         struct btrfs_path *path;
1435         u64 hole_size;
1436         u64 max_hole_start;
1437         u64 max_hole_size;
1438         u64 extent_end;
1439         u64 search_end = device->total_bytes;
1440         int ret;
1441         int slot;
1442         struct extent_buffer *l;
1443
1444         /*
1445          * We don't want to overwrite the superblock on the drive nor any area
1446          * used by the boot loader (grub for example), so we make sure to start
1447          * at an offset of at least 1MB.
1448          */
1449         search_start = max_t(u64, search_start, SZ_1M);
1450
1451         path = btrfs_alloc_path();
1452         if (!path)
1453                 return -ENOMEM;
1454
1455         max_hole_start = search_start;
1456         max_hole_size = 0;
1457
1458 again:
1459         if (search_start >= search_end ||
1460                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1461                 ret = -ENOSPC;
1462                 goto out;
1463         }
1464
1465         path->reada = READA_FORWARD;
1466         path->search_commit_root = 1;
1467         path->skip_locking = 1;
1468
1469         key.objectid = device->devid;
1470         key.offset = search_start;
1471         key.type = BTRFS_DEV_EXTENT_KEY;
1472
1473         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1474         if (ret < 0)
1475                 goto out;
1476         if (ret > 0) {
1477                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1478                 if (ret < 0)
1479                         goto out;
1480         }
1481
1482         while (1) {
1483                 l = path->nodes[0];
1484                 slot = path->slots[0];
1485                 if (slot >= btrfs_header_nritems(l)) {
1486                         ret = btrfs_next_leaf(root, path);
1487                         if (ret == 0)
1488                                 continue;
1489                         if (ret < 0)
1490                                 goto out;
1491
1492                         break;
1493                 }
1494                 btrfs_item_key_to_cpu(l, &key, slot);
1495
1496                 if (key.objectid < device->devid)
1497                         goto next;
1498
1499                 if (key.objectid > device->devid)
1500                         break;
1501
1502                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1503                         goto next;
1504
1505                 if (key.offset > search_start) {
1506                         hole_size = key.offset - search_start;
1507
1508                         /*
1509                          * Have to check before we set max_hole_start, otherwise
1510                          * we could end up sending back this offset anyway.
1511                          */
1512                         if (contains_pending_extent(transaction, device,
1513                                                     &search_start,
1514                                                     hole_size)) {
1515                                 if (key.offset >= search_start) {
1516                                         hole_size = key.offset - search_start;
1517                                 } else {
1518                                         WARN_ON_ONCE(1);
1519                                         hole_size = 0;
1520                                 }
1521                         }
1522
1523                         if (hole_size > max_hole_size) {
1524                                 max_hole_start = search_start;
1525                                 max_hole_size = hole_size;
1526                         }
1527
1528                         /*
1529                          * If this free space is at least as large as what
1530                          * we need, it must be the max free space that we
1531                          * have found so far, so max_hole_start must point
1532                          * to the start of this free space and the length
1533                          * of this free space is stored in max_hole_size.
1534                          * Thus, we return max_hole_start and max_hole_size
1535                          * and go back to the caller.
1536                          */
1537                         if (hole_size >= num_bytes) {
1538                                 ret = 0;
1539                                 goto out;
1540                         }
1541                 }
1542
1543                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1544                 extent_end = key.offset + btrfs_dev_extent_length(l,
1545                                                                   dev_extent);
1546                 if (extent_end > search_start)
1547                         search_start = extent_end;
1548 next:
1549                 path->slots[0]++;
1550                 cond_resched();
1551         }
1552
1553         /*
1554          * At this point, search_start should be the end of
1555          * allocated dev extents, and when shrinking the device,
1556          * search_end may be smaller than search_start.
1557          */
1558         if (search_end > search_start) {
1559                 hole_size = search_end - search_start;
1560
1561                 if (contains_pending_extent(transaction, device, &search_start,
1562                                             hole_size)) {
1563                         btrfs_release_path(path);
1564                         goto again;
1565                 }
1566
1567                 if (hole_size > max_hole_size) {
1568                         max_hole_start = search_start;
1569                         max_hole_size = hole_size;
1570                 }
1571         }
1572
1573         /* See above. */
1574         if (max_hole_size < num_bytes)
1575                 ret = -ENOSPC;
1576         else
1577                 ret = 0;
1578
1579 out:
1580         btrfs_free_path(path);
1581         *start = max_hole_start;
1582         if (len)
1583                 *len = max_hole_size;
1584         return ret;
1585 }
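
/*
 * A minimal userspace model of the search above, assuming the dev extents
 * are visited in offset order (names are illustrative, not kernel API):
 * measure the gap in front of each extent, remember the largest, and stop
 * early once a gap is big enough.
 */
#include <stdint.h>

struct hole_extent { uint64_t offset; uint64_t len; };

static int sketch_find_hole(const struct hole_extent *ext, int n,
			    uint64_t dev_size, uint64_t num_bytes,
			    uint64_t search_start, uint64_t *hole_start)
{
	uint64_t max_start = search_start;
	uint64_t max_size = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (ext[i].offset > search_start) {
			uint64_t hole = ext[i].offset - search_start;

			if (hole > max_size) {
				max_start = search_start;
				max_size = hole;
			}
			if (hole >= num_bytes)
				goto done;
		}
		/* advance past this extent */
		if (ext[i].offset + ext[i].len > search_start)
			search_start = ext[i].offset + ext[i].len;
	}
	/* the trailing hole between the last extent and the device end */
	if (dev_size > search_start && dev_size - search_start > max_size) {
		max_start = search_start;
		max_size = dev_size - search_start;
	}
done:
	*hole_start = max_start;
	return max_size >= num_bytes ? 0 : -1; /* -1 models -ENOSPC */
}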
1586
1587 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1588                          struct btrfs_device *device, u64 num_bytes,
1589                          u64 *start, u64 *len)
1590 {
1591         /* FIXME use last free of some kind */
1592         return find_free_dev_extent_start(trans->transaction, device,
1593                                           num_bytes, 0, start, len);
1594 }
1595
1596 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1597                           struct btrfs_device *device,
1598                           u64 start, u64 *dev_extent_len)
1599 {
1600         struct btrfs_fs_info *fs_info = device->fs_info;
1601         struct btrfs_root *root = fs_info->dev_root;
1602         int ret;
1603         struct btrfs_path *path;
1604         struct btrfs_key key;
1605         struct btrfs_key found_key;
1606         struct extent_buffer *leaf = NULL;
1607         struct btrfs_dev_extent *extent = NULL;
1608
1609         path = btrfs_alloc_path();
1610         if (!path)
1611                 return -ENOMEM;
1612
1613         key.objectid = device->devid;
1614         key.offset = start;
1615         key.type = BTRFS_DEV_EXTENT_KEY;
1616 again:
1617         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1618         if (ret > 0) {
1619                 ret = btrfs_previous_item(root, path, key.objectid,
1620                                           BTRFS_DEV_EXTENT_KEY);
1621                 if (ret)
1622                         goto out;
1623                 leaf = path->nodes[0];
1624                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1625                 extent = btrfs_item_ptr(leaf, path->slots[0],
1626                                         struct btrfs_dev_extent);
1627                 BUG_ON(found_key.offset > start || found_key.offset +
1628                        btrfs_dev_extent_length(leaf, extent) < start);
1629                 key = found_key;
1630                 btrfs_release_path(path);
1631                 goto again;
1632         } else if (ret == 0) {
1633                 leaf = path->nodes[0];
1634                 extent = btrfs_item_ptr(leaf, path->slots[0],
1635                                         struct btrfs_dev_extent);
1636         } else {
1637                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1638                 goto out;
1639         }
1640
1641         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1642
1643         ret = btrfs_del_item(trans, root, path);
1644         if (ret) {
1645                 btrfs_handle_fs_error(fs_info, ret,
1646                                       "Failed to remove dev extent item");
1647         } else {
1648                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1649         }
1650 out:
1651         btrfs_free_path(path);
1652         return ret;
1653 }
1654
1655 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1656                                   struct btrfs_device *device,
1657                                   u64 chunk_offset, u64 start, u64 num_bytes)
1658 {
1659         int ret;
1660         struct btrfs_path *path;
1661         struct btrfs_fs_info *fs_info = device->fs_info;
1662         struct btrfs_root *root = fs_info->dev_root;
1663         struct btrfs_dev_extent *extent;
1664         struct extent_buffer *leaf;
1665         struct btrfs_key key;
1666
1667         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1668         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1669         path = btrfs_alloc_path();
1670         if (!path)
1671                 return -ENOMEM;
1672
1673         key.objectid = device->devid;
1674         key.offset = start;
1675         key.type = BTRFS_DEV_EXTENT_KEY;
1676         ret = btrfs_insert_empty_item(trans, root, path, &key,
1677                                       sizeof(*extent));
1678         if (ret)
1679                 goto out;
1680
1681         leaf = path->nodes[0];
1682         extent = btrfs_item_ptr(leaf, path->slots[0],
1683                                 struct btrfs_dev_extent);
1684         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1685                                         BTRFS_CHUNK_TREE_OBJECTID);
1686         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1687                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1688         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1689
1690         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1691         btrfs_mark_buffer_dirty(leaf);
1692 out:
1693         btrfs_free_path(path);
1694         return ret;
1695 }
1696
1697 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1698 {
1699         struct extent_map_tree *em_tree;
1700         struct extent_map *em;
1701         struct rb_node *n;
1702         u64 ret = 0;
1703
1704         em_tree = &fs_info->mapping_tree.map_tree;
1705         read_lock(&em_tree->lock);
1706         n = rb_last(&em_tree->map);
1707         if (n) {
1708                 em = rb_entry(n, struct extent_map, rb_node);
1709                 ret = em->start + em->len;
1710         }
1711         read_unlock(&em_tree->lock);
1712
1713         return ret;
1714 }
1715
1716 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1717                                     u64 *devid_ret)
1718 {
1719         int ret;
1720         struct btrfs_key key;
1721         struct btrfs_key found_key;
1722         struct btrfs_path *path;
1723
1724         path = btrfs_alloc_path();
1725         if (!path)
1726                 return -ENOMEM;
1727
1728         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1729         key.type = BTRFS_DEV_ITEM_KEY;
1730         key.offset = (u64)-1;
1731
1732         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1733         if (ret < 0)
1734                 goto error;
1735
1736         BUG_ON(ret == 0); /* Corruption */
1737
1738         ret = btrfs_previous_item(fs_info->chunk_root, path,
1739                                   BTRFS_DEV_ITEMS_OBJECTID,
1740                                   BTRFS_DEV_ITEM_KEY);
1741         if (ret) {
1742                 *devid_ret = 1;
1743         } else {
1744                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1745                                       path->slots[0]);
1746                 *devid_ret = found_key.offset + 1;
1747         }
1748         ret = 0;
1749 error:
1750         btrfs_free_path(path);
1751         return ret;
1752 }
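
/*
 * The "last key + 1" idiom above, condensed: searching for offset (u64)-1
 * always lands past the last DEV_ITEM, btrfs_previous_item() steps back to
 * it, and the next devid is that key's offset plus one (or 1 on an empty
 * tree). Modeled here over a sorted array standing in for the chunk tree.
 */
#include <stdint.h>

static uint64_t sketch_next_devid(const uint64_t *sorted_devids, int n)
{
	return n ? sorted_devids[n - 1] + 1 : 1;
}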
1753
1754 /*
1755  * The device information is stored in the chunk root.
1756  * The btrfs_device struct should be fully filled in before calling this.
1757  */
1758 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1759                             struct btrfs_fs_info *fs_info,
1760                             struct btrfs_device *device)
1761 {
1762         struct btrfs_root *root = fs_info->chunk_root;
1763         int ret;
1764         struct btrfs_path *path;
1765         struct btrfs_dev_item *dev_item;
1766         struct extent_buffer *leaf;
1767         struct btrfs_key key;
1768         unsigned long ptr;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = device->devid;
1777
1778         ret = btrfs_insert_empty_item(trans, root, path, &key,
1779                                       sizeof(*dev_item));
1780         if (ret)
1781                 goto out;
1782
1783         leaf = path->nodes[0];
1784         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1785
1786         btrfs_set_device_id(leaf, dev_item, device->devid);
1787         btrfs_set_device_generation(leaf, dev_item, 0);
1788         btrfs_set_device_type(leaf, dev_item, device->type);
1789         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1790         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1791         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1792         btrfs_set_device_total_bytes(leaf, dev_item,
1793                                      btrfs_device_get_disk_total_bytes(device));
1794         btrfs_set_device_bytes_used(leaf, dev_item,
1795                                     btrfs_device_get_bytes_used(device));
1796         btrfs_set_device_group(leaf, dev_item, 0);
1797         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1798         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1799         btrfs_set_device_start_offset(leaf, dev_item, 0);
1800
1801         ptr = btrfs_device_uuid(dev_item);
1802         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1803         ptr = btrfs_device_fsid(dev_item);
1804         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1805         btrfs_mark_buffer_dirty(leaf);
1806
1807         ret = 0;
1808 out:
1809         btrfs_free_path(path);
1810         return ret;
1811 }
1812
1813 /*
1814  * Function to update ctime/mtime for a given device path.
1815  * Mainly used for ctime/mtime based probes like libblkid.
1816  */
1817 static void update_dev_time(const char *path_name)
1818 {
1819         struct file *filp;
1820
1821         filp = filp_open(path_name, O_RDWR, 0);
1822         if (IS_ERR(filp))
1823                 return;
1824         file_update_time(filp);
1825         filp_close(filp, NULL);
1826 }
1827
1828 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1829                              struct btrfs_device *device)
1830 {
1831         struct btrfs_root *root = fs_info->chunk_root;
1832         int ret;
1833         struct btrfs_path *path;
1834         struct btrfs_key key;
1835         struct btrfs_trans_handle *trans;
1836
1837         path = btrfs_alloc_path();
1838         if (!path)
1839                 return -ENOMEM;
1840
1841         trans = btrfs_start_transaction(root, 0);
1842         if (IS_ERR(trans)) {
1843                 btrfs_free_path(path);
1844                 return PTR_ERR(trans);
1845         }
1846         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1847         key.type = BTRFS_DEV_ITEM_KEY;
1848         key.offset = device->devid;
1849
1850         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1851         if (ret) {
1852                 if (ret > 0)
1853                         ret = -ENOENT;
1854                 btrfs_abort_transaction(trans, ret);
1855                 btrfs_end_transaction(trans);
1856                 goto out;
1857         }
1858
1859         ret = btrfs_del_item(trans, root, path);
1860         if (ret) {
1861                 btrfs_abort_transaction(trans, ret);
1862                 btrfs_end_transaction(trans);
1863         }
1864
1865 out:
1866         btrfs_free_path(path);
1867         if (!ret)
1868                 ret = btrfs_commit_transaction(trans);
1869         return ret;
1870 }
1871
1872 /*
1873  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1874  * filesystem. It's up to the caller to adjust that number for, e.g., an
1875  * ongoing device replace.
1876  */
1877 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1878                 u64 num_devices)
1879 {
1880         u64 all_avail;
1881         unsigned seq;
1882         int i;
1883
1884         do {
1885                 seq = read_seqbegin(&fs_info->profiles_lock);
1886
1887                 all_avail = fs_info->avail_data_alloc_bits |
1888                             fs_info->avail_system_alloc_bits |
1889                             fs_info->avail_metadata_alloc_bits;
1890         } while (read_seqretry(&fs_info->profiles_lock, seq));
1891
1892         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1893                 if (!(all_avail & btrfs_raid_group[i]))
1894                         continue;
1895
1896                 if (num_devices < btrfs_raid_array[i].devs_min) {
1897                         int ret = btrfs_raid_mindev_error[i];
1898
1899                         if (ret)
1900                                 return ret;
1901                 }
1902         }
1903
1904         return 0;
1905 }
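
/*
 * A condensed model of the check above: for every profile present in the
 * allocation bits, the device count must still meet that profile's
 * devs_min. The table mirrors a subset of btrfs_raid_array; the bit values
 * and the -1 error are stand-ins, not the kernel's constants.
 */
#include <stdint.h>

static int sketch_check_min_devices(uint64_t all_avail, uint64_t num_devices)
{
	static const struct { uint64_t bit; uint64_t devs_min; } profile[] = {
		{ 1ULL << 0, 4 },	/* RAID10 */
		{ 1ULL << 1, 2 },	/* RAID1  */
		{ 1ULL << 2, 2 },	/* RAID5  */
		{ 1ULL << 3, 3 },	/* RAID6  */
	};
	int i;

	for (i = 0; i < (int)(sizeof(profile) / sizeof(profile[0])); i++) {
		if ((all_avail & profile[i].bit) &&
		    num_devices < profile[i].devs_min)
			return -1; /* per-profile mindev error in the kernel */
	}
	return 0;
}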
1906
1907 static struct btrfs_device *btrfs_find_next_active_device(
1908                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1909 {
1910         struct btrfs_device *next_device;
1911
1912         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1913                 if (next_device != device &&
1914                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1915                     && next_device->bdev)
1916                         return next_device;
1917         }
1918
1919         return NULL;
1920 }
1921
1922 /*
1923  * Helper function to check if the given device is part of s_bdev / latest_bdev
1924  * and replace it with the provided or the next active device. In the context
1925  * where this function is called, there should always be another device (or
1926  * this_dev) which is active.
1927  */
1928 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1929                 struct btrfs_device *device, struct btrfs_device *this_dev)
1930 {
1931         struct btrfs_device *next_device;
1932
1933         if (this_dev)
1934                 next_device = this_dev;
1935         else
1936                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1937                                                                 device);
1938         ASSERT(next_device);
1939
1940         if (fs_info->sb->s_bdev &&
1941             (fs_info->sb->s_bdev == device->bdev))
1942                 fs_info->sb->s_bdev = next_device->bdev;
1943
1944         if (fs_info->fs_devices->latest_bdev == device->bdev)
1945                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1946 }
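
/*
 * The selection policy above, reduced to a loop over a plain array
 * (illustrative types, not kernel API): pick the first device that is not
 * the one being removed, is not missing, and has an open bdev.
 */
#include <stddef.h>

struct sketch_dev { int missing; void *bdev; };

static struct sketch_dev *sketch_next_active(struct sketch_dev *devs, int n,
					     const struct sketch_dev *removed)
{
	int i;

	for (i = 0; i < n; i++) {
		if (&devs[i] != removed && !devs[i].missing && devs[i].bdev)
			return &devs[i];
	}
	return NULL; /* the caller asserts this cannot happen */
}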
1947
1948 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1949                 u64 devid)
1950 {
1951         struct btrfs_device *device;
1952         struct btrfs_fs_devices *cur_devices;
1953         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1954         u64 num_devices;
1955         int ret = 0;
1956
1957         mutex_lock(&uuid_mutex);
1958
1959         num_devices = fs_devices->num_devices;
1960         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1961         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1962                 WARN_ON(num_devices < 1);
1963                 num_devices--;
1964         }
1965         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1966
1967         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1968         if (ret)
1969                 goto out;
1970
1971         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1972                                            &device);
1973         if (ret)
1974                 goto out;
1975
1976         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1977                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1978                 goto out;
1979         }
1980
1981         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1982             fs_info->fs_devices->rw_devices == 1) {
1983                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1984                 goto out;
1985         }
1986
1987         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1988                 mutex_lock(&fs_info->chunk_mutex);
1989                 list_del_init(&device->dev_alloc_list);
1990                 device->fs_devices->rw_devices--;
1991                 mutex_unlock(&fs_info->chunk_mutex);
1992         }
1993
1994         mutex_unlock(&uuid_mutex);
1995         ret = btrfs_shrink_device(device, 0);
1996         mutex_lock(&uuid_mutex);
1997         if (ret)
1998                 goto error_undo;
1999
2000         /*
2001          * TODO: the superblock still includes this device in its num_devices
2002          * counter although write_all_supers() is not locked out. This
2003          * could give a filesystem state which requires a degraded mount.
2004          */
2005         ret = btrfs_rm_dev_item(fs_info, device);
2006         if (ret)
2007                 goto error_undo;
2008
2009         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2010         btrfs_scrub_cancel_dev(fs_info, device);
2011
2012         /*
2013          * The device list mutex makes sure that we don't change
2014          * the device list while someone else is writing out all
2015          * the device supers. Whoever is writing all supers should
2016          * lock the device list mutex before getting the number of
2017          * devices in the super block (super_copy). Conversely,
2018          * whoever updates the number of devices in the super block
2019          * (super_copy) should hold the device list mutex.
2020          */
2021
2022         cur_devices = device->fs_devices;
2023         mutex_lock(&fs_devices->device_list_mutex);
2024         list_del_rcu(&device->dev_list);
2025
2026         device->fs_devices->num_devices--;
2027         device->fs_devices->total_devices--;
2028
2029         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2030                 device->fs_devices->missing_devices--;
2031
2032         btrfs_assign_next_active_device(fs_info, device, NULL);
2033
2034         if (device->bdev) {
2035                 device->fs_devices->open_devices--;
2036                 /* remove sysfs entry */
2037                 btrfs_sysfs_rm_device_link(fs_devices, device);
2038         }
2039
2040         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2041         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2042         mutex_unlock(&fs_devices->device_list_mutex);
2043
2044         /*
2045          * at this point, the device is zero sized and detached from
2046          * the devices list.  All that's left is to zero out the old
2047          * supers and free the device.
2048          */
2049         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2050                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2051
2052         btrfs_close_bdev(device);
2053         call_rcu(&device->rcu, free_device_rcu);
2054
2055         if (cur_devices->open_devices == 0) {
2056                 while (fs_devices) {
2057                         if (fs_devices->seed == cur_devices) {
2058                                 fs_devices->seed = cur_devices->seed;
2059                                 break;
2060                         }
2061                         fs_devices = fs_devices->seed;
2062                 }
2063                 cur_devices->seed = NULL;
2064                 close_fs_devices(cur_devices);
2065                 free_fs_devices(cur_devices);
2066         }
2067
2068 out:
2069         mutex_unlock(&uuid_mutex);
2070         return ret;
2071
2072 error_undo:
2073         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2074                 mutex_lock(&fs_info->chunk_mutex);
2075                 list_add(&device->dev_alloc_list,
2076                          &fs_devices->alloc_list);
2077                 device->fs_devices->rw_devices++;
2078                 mutex_unlock(&fs_info->chunk_mutex);
2079         }
2080         goto out;
2081 }
2082
2083 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2084                                         struct btrfs_device *srcdev)
2085 {
2086         struct btrfs_fs_devices *fs_devices;
2087
2088         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2089
2090         /*
2091          * In the case of a fs with no seed, srcdev->fs_devices will point
2092          * to the fs_devices of fs_info. However, when the dev being replaced
2093          * is a seed dev it will point to the seed's local fs_devices. In
2094          * short, srcdev will have its correct fs_devices in both cases.
2095          */
2096         fs_devices = srcdev->fs_devices;
2097
2098         list_del_rcu(&srcdev->dev_list);
2099         list_del(&srcdev->dev_alloc_list);
2100         fs_devices->num_devices--;
2101         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2102                 fs_devices->missing_devices--;
2103
2104         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2105                 fs_devices->rw_devices--;
2106
2107         if (srcdev->bdev)
2108                 fs_devices->open_devices--;
2109 }
2110
2111 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2112                                       struct btrfs_device *srcdev)
2113 {
2114         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2115
2116         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2117                 /* zero out the old super if it is writable */
2118                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2119         }
2120
2121         btrfs_close_bdev(srcdev);
2122         call_rcu(&srcdev->rcu, free_device_rcu);
2123
2124         /* If there are no devices left, we'd rather delete the fs_devices. */
2125         if (!fs_devices->num_devices) {
2126                 struct btrfs_fs_devices *tmp_fs_devices;
2127
2128                 /*
2129                  * On a mounted FS, num_devices can't be zero unless it's a
2130                  * seed. In case of a seed device being replaced, the replace
2131                  * target is added to the sprout FS, so there will be no more
2132                  * devices left under the seed FS.
2133                  */
2134                 ASSERT(fs_devices->seeding);
2135
2136                 tmp_fs_devices = fs_info->fs_devices;
2137                 while (tmp_fs_devices) {
2138                         if (tmp_fs_devices->seed == fs_devices) {
2139                                 tmp_fs_devices->seed = fs_devices->seed;
2140                                 break;
2141                         }
2142                         tmp_fs_devices = tmp_fs_devices->seed;
2143                 }
2144                 fs_devices->seed = NULL;
2145                 close_fs_devices(fs_devices);
2146                 free_fs_devices(fs_devices);
2147         }
2148 }
2149
2150 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2151                                       struct btrfs_device *tgtdev)
2152 {
2153         mutex_lock(&uuid_mutex);
2154         WARN_ON(!tgtdev);
2155         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2156
2157         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2158
2159         if (tgtdev->bdev)
2160                 fs_info->fs_devices->open_devices--;
2161
2162         fs_info->fs_devices->num_devices--;
2163
2164         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2165
2166         list_del_rcu(&tgtdev->dev_list);
2167
2168         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2169         mutex_unlock(&uuid_mutex);
2170
2171         /*
2172          * The update_dev_time() within btrfs_scratch_superblocks()
2173          * may lead to a call to btrfs_show_devname() which will try
2174          * to hold device_list_mutex. And here this device
2175          * is already out of the device list, so we don't have to hold
2176          * the device_list_mutex lock.
2177          */
2178         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2179
2180         btrfs_close_bdev(tgtdev);
2181         call_rcu(&tgtdev->rcu, free_device_rcu);
2182 }
2183
2184 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2185                                      const char *device_path,
2186                                      struct btrfs_device **device)
2187 {
2188         int ret = 0;
2189         struct btrfs_super_block *disk_super;
2190         u64 devid;
2191         u8 *dev_uuid;
2192         struct block_device *bdev;
2193         struct buffer_head *bh;
2194
2195         *device = NULL;
2196         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2197                                     fs_info->bdev_holder, 0, &bdev, &bh);
2198         if (ret)
2199                 return ret;
2200         disk_super = (struct btrfs_super_block *)bh->b_data;
2201         devid = btrfs_stack_device_id(&disk_super->dev_item);
2202         dev_uuid = disk_super->dev_item.uuid;
2203         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2204         brelse(bh);
2205         if (!*device)
2206                 ret = -ENOENT;
2207         blkdev_put(bdev, FMODE_READ);
2208         return ret;
2209 }
2210
2211 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2212                                          const char *device_path,
2213                                          struct btrfs_device **device)
2214 {
2215         *device = NULL;
2216         if (strcmp(device_path, "missing") == 0) {
2217                 struct list_head *devices;
2218                 struct btrfs_device *tmp;
2219
2220                 devices = &fs_info->fs_devices->devices;
2221                 list_for_each_entry(tmp, devices, dev_list) {
2222                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2223                                         &tmp->dev_state) && !tmp->bdev) {
2224                                 *device = tmp;
2225                                 break;
2226                         }
2227                 }
2228
2229                 if (!*device)
2230                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2231
2232                 return 0;
2233         } else {
2234                 return btrfs_find_device_by_path(fs_info, device_path, device);
2235         }
2236 }
2237
2238 /*
2239  * Lookup a device given by device id, or the path if the id is 0.
2240  */
2241 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2242                                  const char *devpath,
2243                                  struct btrfs_device **device)
2244 {
2245         int ret;
2246
2247         if (devid) {
2248                 ret = 0;
2249                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2250                 if (!*device)
2251                         ret = -ENOENT;
2252         } else {
2253                 if (!devpath || !devpath[0])
2254                         return -EINVAL;
2255
2256                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2257                                                            device);
2258         }
2259         return ret;
2260 }
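
/*
 * The lookup policy above as a bare decision table, with stand-in return
 * codes (not the kernel's): a non-zero devid always wins; the literal path
 * "missing" selects the first device that is in the metadata but has no
 * bdev; any other non-empty string is resolved by reading the superblock
 * at that path.
 */
#include <errno.h>
#include <stdint.h>
#include <string.h>

static int sketch_devspec_policy(uint64_t devid, const char *path)
{
	if (devid)
		return 0;		/* look up by device id */
	if (!path || !path[0])
		return -EINVAL;		/* nothing to go on */
	if (strcmp(path, "missing") == 0)
		return 1;		/* scan for a missing device */
	return 2;			/* resolve by device path */
}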
2261
2262 /*
2263  * Does all the dirty work required for changing the file system's UUID.
2264  */
2265 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2266 {
2267         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2268         struct btrfs_fs_devices *old_devices;
2269         struct btrfs_fs_devices *seed_devices;
2270         struct btrfs_super_block *disk_super = fs_info->super_copy;
2271         struct btrfs_device *device;
2272         u64 super_flags;
2273
2274         lockdep_assert_held(&uuid_mutex);
2275         if (!fs_devices->seeding)
2276                 return -EINVAL;
2277
2278         seed_devices = alloc_fs_devices(NULL);
2279         if (IS_ERR(seed_devices))
2280                 return PTR_ERR(seed_devices);
2281
2282         old_devices = clone_fs_devices(fs_devices);
2283         if (IS_ERR(old_devices)) {
2284                 kfree(seed_devices);
2285                 return PTR_ERR(old_devices);
2286         }
2287
2288         list_add(&old_devices->fs_list, &fs_uuids);
2289
2290         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2291         seed_devices->opened = 1;
2292         INIT_LIST_HEAD(&seed_devices->devices);
2293         INIT_LIST_HEAD(&seed_devices->alloc_list);
2294         mutex_init(&seed_devices->device_list_mutex);
2295
2296         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2297         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2298                               synchronize_rcu);
2299         list_for_each_entry(device, &seed_devices->devices, dev_list)
2300                 device->fs_devices = seed_devices;
2301
2302         mutex_lock(&fs_info->chunk_mutex);
2303         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2304         mutex_unlock(&fs_info->chunk_mutex);
2305
2306         fs_devices->seeding = 0;
2307         fs_devices->num_devices = 0;
2308         fs_devices->open_devices = 0;
2309         fs_devices->missing_devices = 0;
2310         fs_devices->rotating = 0;
2311         fs_devices->seed = seed_devices;
2312
2313         generate_random_uuid(fs_devices->fsid);
2314         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2315         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2316         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2317
2318         super_flags = btrfs_super_flags(disk_super) &
2319                       ~BTRFS_SUPER_FLAG_SEEDING;
2320         btrfs_set_super_flags(disk_super, super_flags);
2321
2322         return 0;
2323 }
2324
2325 /*
2326  * Store the expected generation for seed devices in device items.
2327  */
2328 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2329                                struct btrfs_fs_info *fs_info)
2330 {
2331         struct btrfs_root *root = fs_info->chunk_root;
2332         struct btrfs_path *path;
2333         struct extent_buffer *leaf;
2334         struct btrfs_dev_item *dev_item;
2335         struct btrfs_device *device;
2336         struct btrfs_key key;
2337         u8 fs_uuid[BTRFS_FSID_SIZE];
2338         u8 dev_uuid[BTRFS_UUID_SIZE];
2339         u64 devid;
2340         int ret;
2341
2342         path = btrfs_alloc_path();
2343         if (!path)
2344                 return -ENOMEM;
2345
2346         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2347         key.offset = 0;
2348         key.type = BTRFS_DEV_ITEM_KEY;
2349
2350         while (1) {
2351                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2352                 if (ret < 0)
2353                         goto error;
2354
2355                 leaf = path->nodes[0];
2356 next_slot:
2357                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2358                         ret = btrfs_next_leaf(root, path);
2359                         if (ret > 0)
2360                                 break;
2361                         if (ret < 0)
2362                                 goto error;
2363                         leaf = path->nodes[0];
2364                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2365                         btrfs_release_path(path);
2366                         continue;
2367                 }
2368
2369                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2370                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2371                     key.type != BTRFS_DEV_ITEM_KEY)
2372                         break;
2373
2374                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2375                                           struct btrfs_dev_item);
2376                 devid = btrfs_device_id(leaf, dev_item);
2377                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2378                                    BTRFS_UUID_SIZE);
2379                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2380                                    BTRFS_FSID_SIZE);
2381                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2382                 BUG_ON(!device); /* Logic error */
2383
2384                 if (device->fs_devices->seeding) {
2385                         btrfs_set_device_generation(leaf, dev_item,
2386                                                     device->generation);
2387                         btrfs_mark_buffer_dirty(leaf);
2388                 }
2389
2390                 path->slots[0]++;
2391                 goto next_slot;
2392         }
2393         ret = 0;
2394 error:
2395         btrfs_free_path(path);
2396         return ret;
2397 }
2398
2399 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2400 {
2401         struct btrfs_root *root = fs_info->dev_root;
2402         struct request_queue *q;
2403         struct btrfs_trans_handle *trans;
2404         struct btrfs_device *device;
2405         struct block_device *bdev;
2406         struct list_head *devices;
2407         struct super_block *sb = fs_info->sb;
2408         struct rcu_string *name;
2409         u64 tmp;
2410         int seeding_dev = 0;
2411         int ret = 0;
2412         bool unlocked = false;
2413
2414         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2415                 return -EROFS;
2416
2417         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2418                                   fs_info->bdev_holder);
2419         if (IS_ERR(bdev))
2420                 return PTR_ERR(bdev);
2421
2422         if (fs_info->fs_devices->seeding) {
2423                 seeding_dev = 1;
2424                 down_write(&sb->s_umount);
2425                 mutex_lock(&uuid_mutex);
2426         }
2427
2428         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2429
2430         devices = &fs_info->fs_devices->devices;
2431
2432         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2433         list_for_each_entry(device, devices, dev_list) {
2434                 if (device->bdev == bdev) {
2435                         ret = -EEXIST;
2436                         mutex_unlock(
2437                                 &fs_info->fs_devices->device_list_mutex);
2438                         goto error;
2439                 }
2440         }
2441         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2442
2443         device = btrfs_alloc_device(fs_info, NULL, NULL);
2444         if (IS_ERR(device)) {
2445                 /* we can safely leave the fs_devices entry around */
2446                 ret = PTR_ERR(device);
2447                 goto error;
2448         }
2449
2450         name = rcu_string_strdup(device_path, GFP_KERNEL);
2451         if (!name) {
2452                 ret = -ENOMEM;
2453                 goto error_free_device;
2454         }
2455         rcu_assign_pointer(device->name, name);
2456
2457         trans = btrfs_start_transaction(root, 0);
2458         if (IS_ERR(trans)) {
2459                 ret = PTR_ERR(trans);
2460                 goto error_free_device;
2461         }
2462
2463         q = bdev_get_queue(bdev);
2464         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2465         device->generation = trans->transid;
2466         device->io_width = fs_info->sectorsize;
2467         device->io_align = fs_info->sectorsize;
2468         device->sector_size = fs_info->sectorsize;
2469         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2470                                          fs_info->sectorsize);
2471         device->disk_total_bytes = device->total_bytes;
2472         device->commit_total_bytes = device->total_bytes;
2473         device->fs_info = fs_info;
2474         device->bdev = bdev;
2475         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2476         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2477         device->mode = FMODE_EXCL;
2478         device->dev_stats_valid = 1;
2479         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2480
2481         if (seeding_dev) {
2482                 sb->s_flags &= ~SB_RDONLY;
2483                 ret = btrfs_prepare_sprout(fs_info);
2484                 if (ret) {
2485                         btrfs_abort_transaction(trans, ret);
2486                         goto error_trans;
2487                 }
2488         }
2489
2490         device->fs_devices = fs_info->fs_devices;
2491
2492         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2493         mutex_lock(&fs_info->chunk_mutex);
2494         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2495         list_add(&device->dev_alloc_list,
2496                  &fs_info->fs_devices->alloc_list);
2497         fs_info->fs_devices->num_devices++;
2498         fs_info->fs_devices->open_devices++;
2499         fs_info->fs_devices->rw_devices++;
2500         fs_info->fs_devices->total_devices++;
2501         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2502
2503         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2504
2505         if (!blk_queue_nonrot(q))
2506                 fs_info->fs_devices->rotating = 1;
2507
2508         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2509         btrfs_set_super_total_bytes(fs_info->super_copy,
2510                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2511
2512         tmp = btrfs_super_num_devices(fs_info->super_copy);
2513         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2514
2515         /* add sysfs device entry */
2516         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2517
2518         /*
2519          * We've got more storage; clear any full flags on the space
2520          * infos.
2521          */
2522         btrfs_clear_space_info_full(fs_info);
2523
2524         mutex_unlock(&fs_info->chunk_mutex);
2525         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2526
2527         if (seeding_dev) {
2528                 mutex_lock(&fs_info->chunk_mutex);
2529                 ret = init_first_rw_device(trans, fs_info);
2530                 mutex_unlock(&fs_info->chunk_mutex);
2531                 if (ret) {
2532                         btrfs_abort_transaction(trans, ret);
2533                         goto error_sysfs;
2534                 }
2535         }
2536
2537         ret = btrfs_add_dev_item(trans, fs_info, device);
2538         if (ret) {
2539                 btrfs_abort_transaction(trans, ret);
2540                 goto error_sysfs;
2541         }
2542
2543         if (seeding_dev) {
2544                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2545
2546                 ret = btrfs_finish_sprout(trans, fs_info);
2547                 if (ret) {
2548                         btrfs_abort_transaction(trans, ret);
2549                         goto error_sysfs;
2550                 }
2551
2552                 /* Sprouting would change the fsid of the mounted root,
2553                  * so rename the fsid in sysfs.
2554                  */
2555                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2556                                                 fs_info->fsid);
2557                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2558                         btrfs_warn(fs_info,
2559                                    "sysfs: failed to create fsid for sprout");
2560         }
2561
2562         ret = btrfs_commit_transaction(trans);
2563
2564         if (seeding_dev) {
2565                 mutex_unlock(&uuid_mutex);
2566                 up_write(&sb->s_umount);
2567                 unlocked = true;
2568
2569                 if (ret) /* transaction commit */
2570                         return ret;
2571
2572                 ret = btrfs_relocate_sys_chunks(fs_info);
2573                 if (ret < 0)
2574                         btrfs_handle_fs_error(fs_info, ret,
2575                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2576                 trans = btrfs_attach_transaction(root);
2577                 if (IS_ERR(trans)) {
2578                         if (PTR_ERR(trans) == -ENOENT)
2579                                 return 0;
2580                         ret = PTR_ERR(trans);
2581                         trans = NULL;
2582                         goto error_sysfs;
2583                 }
2584                 ret = btrfs_commit_transaction(trans);
2585         }
2586
2587         /* Update ctime/mtime for libblkid */
2588         update_dev_time(device_path);
2589         return ret;
2590
2591 error_sysfs:
2592         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2593 error_trans:
2594         if (seeding_dev)
2595                 sb->s_flags |= SB_RDONLY;
2596         if (trans)
2597                 btrfs_end_transaction(trans);
2598 error_free_device:
2599         btrfs_free_device(device);
2600 error:
2601         blkdev_put(bdev, FMODE_EXCL);
2602         if (seeding_dev && !unlocked) {
2603                 mutex_unlock(&uuid_mutex);
2604                 up_write(&sb->s_umount);
2605         }
2606         return ret;
2607 }
2608
2609 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2610                                         struct btrfs_device *device)
2611 {
2612         int ret;
2613         struct btrfs_path *path;
2614         struct btrfs_root *root = device->fs_info->chunk_root;
2615         struct btrfs_dev_item *dev_item;
2616         struct extent_buffer *leaf;
2617         struct btrfs_key key;
2618
2619         path = btrfs_alloc_path();
2620         if (!path)
2621                 return -ENOMEM;
2622
2623         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2624         key.type = BTRFS_DEV_ITEM_KEY;
2625         key.offset = device->devid;
2626
2627         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2628         if (ret < 0)
2629                 goto out;
2630
2631         if (ret > 0) {
2632                 ret = -ENOENT;
2633                 goto out;
2634         }
2635
2636         leaf = path->nodes[0];
2637         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2638
2639         btrfs_set_device_id(leaf, dev_item, device->devid);
2640         btrfs_set_device_type(leaf, dev_item, device->type);
2641         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2642         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2643         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2644         btrfs_set_device_total_bytes(leaf, dev_item,
2645                                      btrfs_device_get_disk_total_bytes(device));
2646         btrfs_set_device_bytes_used(leaf, dev_item,
2647                                     btrfs_device_get_bytes_used(device));
2648         btrfs_mark_buffer_dirty(leaf);
2649
2650 out:
2651         btrfs_free_path(path);
2652         return ret;
2653 }
2654
2655 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2656                       struct btrfs_device *device, u64 new_size)
2657 {
2658         struct btrfs_fs_info *fs_info = device->fs_info;
2659         struct btrfs_super_block *super_copy = fs_info->super_copy;
2660         struct btrfs_fs_devices *fs_devices;
2661         u64 old_total;
2662         u64 diff;
2663
2664         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2665                 return -EACCES;
2666
2667         new_size = round_down(new_size, fs_info->sectorsize);
2668
2669         mutex_lock(&fs_info->chunk_mutex);
2670         old_total = btrfs_super_total_bytes(super_copy);
2671         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2672
2673         if (new_size <= device->total_bytes ||
2674             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2675                 mutex_unlock(&fs_info->chunk_mutex);
2676                 return -EINVAL;
2677         }
2678
2679         fs_devices = fs_info->fs_devices;
2680
2681         btrfs_set_super_total_bytes(super_copy,
2682                         round_down(old_total + diff, fs_info->sectorsize));
2683         device->fs_devices->total_rw_bytes += diff;
2684
2685         btrfs_device_set_total_bytes(device, new_size);
2686         btrfs_device_set_disk_total_bytes(device, new_size);
2687         btrfs_clear_space_info_full(device->fs_info);
2688         if (list_empty(&device->resized_list))
2689                 list_add_tail(&device->resized_list,
2690                               &fs_devices->resized_devices);
2691         mutex_unlock(&fs_info->chunk_mutex);
2692
2693         return btrfs_update_device(trans, device);
2694 }
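
/*
 * The size bookkeeping above in isolation: the new size and the delta added
 * to the superblock total are both rounded down to the sector size, keeping
 * all totals sector-aligned. Worked example, assuming 4096-byte sectors:
 * round_down(10001234, 4096) == 9998336, and the super total then grows by
 * round_down(9998336 - device->total_bytes, 4096).
 */
#include <stdint.h>

static inline uint64_t sketch_round_down(uint64_t x, uint64_t align)
{
	return x - (x % align); /* align is a power of two here */
}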
2695
2696 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2697                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2698 {
2699         struct btrfs_root *root = fs_info->chunk_root;
2700         int ret;
2701         struct btrfs_path *path;
2702         struct btrfs_key key;
2703
2704         path = btrfs_alloc_path();
2705         if (!path)
2706                 return -ENOMEM;
2707
2708         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2709         key.offset = chunk_offset;
2710         key.type = BTRFS_CHUNK_ITEM_KEY;
2711
2712         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2713         if (ret < 0)
2714                 goto out;
2715         else if (ret > 0) { /* Logic error or corruption */
2716                 btrfs_handle_fs_error(fs_info, -ENOENT,
2717                                       "Failed lookup while freeing chunk.");
2718                 ret = -ENOENT;
2719                 goto out;
2720         }
2721
2722         ret = btrfs_del_item(trans, root, path);
2723         if (ret < 0)
2724                 btrfs_handle_fs_error(fs_info, ret,
2725                                       "Failed to delete chunk item.");
2726 out:
2727         btrfs_free_path(path);
2728         return ret;
2729 }
2730
2731 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2732 {
2733         struct btrfs_super_block *super_copy = fs_info->super_copy;
2734         struct btrfs_disk_key *disk_key;
2735         struct btrfs_chunk *chunk;
2736         u8 *ptr;
2737         int ret = 0;
2738         u32 num_stripes;
2739         u32 array_size;
2740         u32 len = 0;
2741         u32 cur;
2742         struct btrfs_key key;
2743
2744         mutex_lock(&fs_info->chunk_mutex);
2745         array_size = btrfs_super_sys_array_size(super_copy);
2746
2747         ptr = super_copy->sys_chunk_array;
2748         cur = 0;
2749
2750         while (cur < array_size) {
2751                 disk_key = (struct btrfs_disk_key *)ptr;
2752                 btrfs_disk_key_to_cpu(&key, disk_key);
2753
2754                 len = sizeof(*disk_key);
2755
2756                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2757                         chunk = (struct btrfs_chunk *)(ptr + len);
2758                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2759                         len += btrfs_chunk_item_size(num_stripes);
2760                 } else {
2761                         ret = -EIO;
2762                         break;
2763                 }
2764                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2765                     key.offset == chunk_offset) {
2766                         memmove(ptr, ptr + len, array_size - (cur + len));
2767                         array_size -= len;
2768                         btrfs_set_super_sys_array_size(super_copy, array_size);
2769                 } else {
2770                         ptr += len;
2771                         cur += len;
2772                 }
2773         }
2774         mutex_unlock(&fs_info->chunk_mutex);
2775         return ret;
2776 }
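
/*
 * A compact model of the removal above: sys_chunk_array is a packed byte
 * buffer of (disk_key, chunk) records, so deleting one record is a single
 * memmove of everything behind it plus shrinking the recorded size. The
 * record offset and length are taken as already computed.
 */
#include <stdint.h>
#include <string.h>

static uint32_t sketch_remove_sys_record(uint8_t *array, uint32_t array_size,
					 uint32_t rec_off, uint32_t rec_len)
{
	memmove(array + rec_off, array + rec_off + rec_len,
		array_size - (rec_off + rec_len));
	return array_size - rec_len; /* the new sys_array_size */
}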
2777
2778 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2779                                         u64 logical, u64 length)
2780 {
2781         struct extent_map_tree *em_tree;
2782         struct extent_map *em;
2783
2784         em_tree = &fs_info->mapping_tree.map_tree;
2785         read_lock(&em_tree->lock);
2786         em = lookup_extent_mapping(em_tree, logical, length);
2787         read_unlock(&em_tree->lock);
2788
2789         if (!em) {
2790                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2791                            logical, length);
2792                 return ERR_PTR(-EINVAL);
2793         }
2794
2795         if (em->start > logical || em->start + em->len < logical) {
2796                 btrfs_crit(fs_info,
2797                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2798                            logical, length, em->start, em->start + em->len);
2799                 free_extent_map(em);
2800                 return ERR_PTR(-EINVAL);
2801         }
2802
2803         /* callers are responsible for dropping em's ref. */
2804         return em;
2805 }
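
/*
 * The sanity check above as a predicate. Note the kernel test accepts a
 * mapping that ends exactly at 'logical', since it only rejects when
 * em->start + em->len < logical; the strict containment form is shown
 * here for clarity.
 */
#include <stdint.h>

static inline int sketch_map_covers(uint64_t em_start, uint64_t em_len,
				    uint64_t logical)
{
	return em_start <= logical && logical < em_start + em_len;
}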
2806
2807 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2808                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2809 {
2810         struct extent_map *em;
2811         struct map_lookup *map;
2812         u64 dev_extent_len = 0;
2813         int i, ret = 0;
2814         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2815
2816         em = get_chunk_map(fs_info, chunk_offset, 1);
2817         if (IS_ERR(em)) {
2818                 /*
2819                  * This is a logic error, but we don't want to just rely on the
2820                  * user having built with ASSERT enabled, so if ASSERT doesn't
2821                  * do anything we still error out.
2822                  */
2823                 ASSERT(0);
2824                 return PTR_ERR(em);
2825         }
2826         map = em->map_lookup;
2827         mutex_lock(&fs_info->chunk_mutex);
2828         check_system_chunk(trans, fs_info, map->type);
2829         mutex_unlock(&fs_info->chunk_mutex);
2830
2831         /*
2832          * Take the device list mutex to prevent races with the final phase of
2833          * a device replace operation that replaces the device object associated
2834          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2835          */
2836         mutex_lock(&fs_devices->device_list_mutex);
2837         for (i = 0; i < map->num_stripes; i++) {
2838                 struct btrfs_device *device = map->stripes[i].dev;
2839                 ret = btrfs_free_dev_extent(trans, device,
2840                                             map->stripes[i].physical,
2841                                             &dev_extent_len);
2842                 if (ret) {
2843                         mutex_unlock(&fs_devices->device_list_mutex);
2844                         btrfs_abort_transaction(trans, ret);
2845                         goto out;
2846                 }
2847
2848                 if (device->bytes_used > 0) {
2849                         mutex_lock(&fs_info->chunk_mutex);
2850                         btrfs_device_set_bytes_used(device,
2851                                         device->bytes_used - dev_extent_len);
2852                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2853                         btrfs_clear_space_info_full(fs_info);
2854                         mutex_unlock(&fs_info->chunk_mutex);
2855                 }
2856
2857                 if (map->stripes[i].dev) {
2858                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2859                         if (ret) {
2860                                 mutex_unlock(&fs_devices->device_list_mutex);
2861                                 btrfs_abort_transaction(trans, ret);
2862                                 goto out;
2863                         }
2864                 }
2865         }
2866         mutex_unlock(&fs_devices->device_list_mutex);
2867
2868         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2869         if (ret) {
2870                 btrfs_abort_transaction(trans, ret);
2871                 goto out;
2872         }
2873
2874         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2875
2876         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2877                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2878                 if (ret) {
2879                         btrfs_abort_transaction(trans, ret);
2880                         goto out;
2881                 }
2882         }
2883
2884         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2885         if (ret) {
2886                 btrfs_abort_transaction(trans, ret);
2887                 goto out;
2888         }
2889
2890 out:
2890 out:
2891         /* once for us: drop the reference taken by get_chunk_map() */
2892         free_extent_map(em);
2893         return ret;
2894 }
2895
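/*
 * Relocate the chunk at @chunk_offset: move every extent it holds to other
 * chunks, then delete the now empty chunk.  The caller must hold
 * delete_unused_bgs_mutex (see the comment below).
 */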
2896 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2897 {
2898         struct btrfs_root *root = fs_info->chunk_root;
2899         struct btrfs_trans_handle *trans;
2900         int ret;
2901
2902         /*
2903          * Prevent races with automatic removal of unused block groups.
2904          * After we relocate and before we remove the chunk with offset
2905          * chunk_offset, automatic removal of the block group can kick in,
2906          * resulting in a failure when calling btrfs_remove_chunk() below.
2907          *
2908          * Make sure to acquire this mutex before doing a tree search (dev
2909          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2910          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2911          * we release the path used to search the chunk/dev tree and before
2912          * the current task acquires this mutex and calls us.
2913          */
2914         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2915
2916         ret = btrfs_can_relocate(fs_info, chunk_offset);
2917         if (ret)
2918                 return -ENOSPC;
2919
2920         /* step one, relocate all the extents inside this chunk */
2921         btrfs_scrub_pause(fs_info);
2922         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2923         btrfs_scrub_continue(fs_info);
2924         if (ret)
2925                 return ret;
2926
2927         /*
2928          * We add the kobjects here (and after forcing data chunk creation)
2929          * since relocation is the only place we'll create chunks of a new
2930          * type at runtime.  The only place where we'll remove the last
2931          * chunk of a type is the call immediately below this one.  Even
2932          * so, we're protected against races with the cleaner thread since
2933          * we're covered by the delete_unused_bgs_mutex.
2934          */
2935         btrfs_add_raid_kobjects(fs_info);
2936
2937         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2938                                                      chunk_offset);
2939         if (IS_ERR(trans)) {
2940                 ret = PTR_ERR(trans);
2941                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2942                 return ret;
2943         }
2944
2945         /*
2946          * step two, delete the device extents and the
2947          * chunk tree entries
2948          */
2949         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2950         btrfs_end_transaction(trans);
2951         return ret;
2952 }
2953
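/*
 * Walk the chunk tree from the highest offset backwards and relocate every
 * SYSTEM chunk.  Chunks that fail with -ENOSPC are retried once after all
 * other system chunks have been moved; failing again returns -ENOSPC.
 */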
2954 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2955 {
2956         struct btrfs_root *chunk_root = fs_info->chunk_root;
2957         struct btrfs_path *path;
2958         struct extent_buffer *leaf;
2959         struct btrfs_chunk *chunk;
2960         struct btrfs_key key;
2961         struct btrfs_key found_key;
2962         u64 chunk_type;
2963         bool retried = false;
2964         int failed = 0;
2965         int ret;
2966
2967         path = btrfs_alloc_path();
2968         if (!path)
2969                 return -ENOMEM;
2970
2971 again:
2972         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2973         key.offset = (u64)-1;
2974         key.type = BTRFS_CHUNK_ITEM_KEY;
2975
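        /*
         * Start past the highest possible chunk offset and walk backwards
         * via btrfs_previous_item().  The path is released before each
         * relocation, so the tree may change between iterations; the next
         * search restarts from found_key.offset - 1.
         */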
2976         while (1) {
2977                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2978                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2979                 if (ret < 0) {
2980                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2981                         goto error;
2982                 }
2983                 BUG_ON(ret == 0); /* Corruption */
2984
2985                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2986                                           key.type);
2987                 if (ret)
2988                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2989                 if (ret < 0)
2990                         goto error;
2991                 if (ret > 0)
2992                         break;
2993
2994                 leaf = path->nodes[0];
2995                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2996
2997                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2998                                        struct btrfs_chunk);
2999                 chunk_type = btrfs_chunk_type(leaf, chunk);
3000                 btrfs_release_path(path);
3001
3002                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3003                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3004                         if (ret == -ENOSPC)
3005                                 failed++;
3006                         else
3007                                 BUG_ON(ret);
3008                 }
3009                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3010
3011                 if (found_key.offset == 0)
3012                         break;
3013                 key.offset = found_key.offset - 1;
3014         }
3015         ret = 0;
3016         if (failed && !retried) {
3017                 failed = 0;
3018                 retried = true;
3019                 goto again;
3020         } else if (WARN_ON(failed && retried)) {
3021                 ret = -ENOSPC;
3022         }
3023 error:
3024         btrfs_free_path(path);
3025         return ret;
3026 }
3027
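/*
 * If @chunk_offset refers to a data block group and no data bytes are in
 * use at all, force the allocation of a new data chunk so the filesystem
 * is not left without one.
 */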
3028 /*
3029  * return 1 : a data chunk was allocated successfully,
3030  * return <0: an error occurred while allocating a data chunk,
3031  * return 0 : no data chunk needed to be allocated.
3032  */
3033 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3034                                       u64 chunk_offset)
3035 {
3036         struct btrfs_block_group_cache *cache;
3037         u64 bytes_used;
3038         u64 chunk_type;
3039
3040         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3041         ASSERT(cache);
3042         chunk_type = cache->flags;
3043         btrfs_put_block_group(cache);
3044
3045         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3046                 spin_lock(&fs_info->data_sinfo->lock);
3047                 bytes_used = fs_info->data_sinfo->bytes_used;
3048                 spin_unlock(&fs_info->data_sinfo->lock);
3049
3050                 if (!bytes_used) {
3051                         struct btrfs_trans_handle *trans;
3052                         int ret;
3053
3054                         trans = btrfs_join_transaction(fs_info->tree_root);
3055                         if (IS_ERR(trans))
3056                                 return PTR_ERR(trans);
3057
3058                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3059                                                       BTRFS_BLOCK_GROUP_DATA);
3060                         btrfs_end_transaction(trans);
3061                         if (ret < 0)
3062                                 return ret;
3063
3064                         btrfs_add_raid_kobjects(fs_info);
3065
3066                         return 1;
3067                 }
3068         }
3069         return 0;
3070 }
3071
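/*
 * Write the balance item describing the data/meta/sys balance filters to
 * the tree root, so an interrupted balance can be recognized and resumed.
 */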
3072 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3073                                struct btrfs_balance_control *bctl)
3074 {
3075         struct btrfs_root *root = fs_info->tree_root;
3076         struct btrfs_trans_handle *trans;
3077         struct btrfs_balance_item *item;
3078         struct btrfs_disk_balance_args disk_bargs;
3079         struct btrfs_path *path;
3080         struct extent_buffer *leaf;
3081         struct btrfs_key key;
3082         int ret, err;
3083
3084         path = btrfs_alloc_path();
3085         if (!path)
3086                 return -ENOMEM;
3087
3088         trans = btrfs_start_transaction(root, 0);
3089         if (IS_ERR(trans)) {
3090                 btrfs_free_path(path);
3091                 return PTR_ERR(trans);
3092         }
3093
3094         key.objectid = BTRFS_BALANCE_OBJECTID;
3095         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3096         key.offset = 0;
3097
3098         ret = btrfs_insert_empty_item(trans, root, path, &key,
3099                                       sizeof(*item));
3100         if (ret)
3101                 goto out;
3102
3103         leaf = path->nodes[0];
3104         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3105
3106         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3107
3108         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3109         btrfs_set_balance_data(leaf, item, &disk_bargs);
3110         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3111         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3112         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3113         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3114
3115         btrfs_set_balance_flags(leaf, item, bctl->flags);
3116
3117         btrfs_mark_buffer_dirty(leaf);
3118 out:
3119         btrfs_free_path(path);
3120         err = btrfs_commit_transaction(trans);
3121         if (err && !ret)
3122                 ret = err;
3123         return ret;
3124 }
3125
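/*
 * Remove the balance item from the tree root.  Returns -ENOENT if no
 * balance item is present.
 */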
3126 static int del_balance_item(struct btrfs_fs_info *fs_info)
3127 {
3128         struct btrfs_root *root = fs_info->tree_root;
3129         struct btrfs_trans_handle *trans;
3130         struct btrfs_path *path;
3131         struct btrfs_key key;
3132         int ret, err;
3133
3134         path = btrfs_alloc_path();
3135         if (!path)
3136                 return -ENOMEM;
3137
3138         trans = btrfs_start_transaction(root, 0);
3139         if (IS_ERR(trans)) {
3140                 btrfs_free_path(path);
3141                 return PTR_ERR(trans);
3142         }
3143
3144         key.objectid = BTRFS_BALANCE_OBJECTID;
3145         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3146         key.offset = 0;
3147
3148         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3149         if (ret < 0)
3150                 goto out;
3151         if (ret > 0) {
3152                 ret = -ENOENT;
3153                 goto out;
3154         }
3155
3156         ret = btrfs_del_item(trans, root, path);
3157 out:
3158         btrfs_free_path(path);
3159         err = btrfs_commit_transaction(trans);
3160         if (err && !ret)
3161                 ret = err;
3162         return ret;
3163 }
3164
3165 /*
3166  * This is a heuristic used to reduce the number of chunks balanced on
3167  * resume after balance was interrupted.
3168  */
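/*
 * Illustrative effect (editorial note): a data balance that was converting
 * chunks resumes with the "soft" filter added, while a plain data balance
 * resumes with "usage=90", skipping chunks that are already at least 90%
 * full.
 */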
3169 static void update_balance_args(struct btrfs_balance_control *bctl)
3170 {
3171         /*
3172          * Turn on soft mode for chunk types that were being converted.
3173          */
3174         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3175                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3176         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3177                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3178         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3179                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3180
3181         /*
3182          * Turn on usage filter if it is not already used.  The idea is
3183          * that chunks that we have already balanced should be
3184          * reasonably full.  Don't do it for chunks that are being
3185          * converted - that will keep us from relocating unconverted
3186          * (albeit full) chunks.
3187          */
3188         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3189             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3190             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3191                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3192                 bctl->data.usage = 90;
3193         }
3194         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3195             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3196             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3197                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3198                 bctl->sys.usage = 90;
3199         }
3200         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3201             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3202             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3203                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3204                 bctl->meta.usage = 90;
3205         }
3206 }
3207
3208 /*
3209  * Must be called with both the balance and volume mutexes held, to
3210  * serialize other volume operations (add_dev/rm_dev/resize) with the
3211  * restriper.  The same applies to reset_balance_state.
3212  */
3213 static void set_balance_control(struct btrfs_balance_control *bctl)
3214 {
3215         struct btrfs_fs_info *fs_info = bctl->fs_info;
3216
3217         BUG_ON(fs_info->balance_ctl);
3218
3219         spin_lock(&fs_info->balance_lock);
3220         fs_info->balance_ctl = bctl;
3221         spin_unlock(&fs_info->balance_lock);
3222 }
3223
3224 /*
3225  * Clear the balance status in fs_info and delete the balance item from disk.
3226  */
3227 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3228 {
3229         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3230         int ret;
3231
3232         BUG_ON(!fs_info->balance_ctl);
3233
3234         spin_lock(&fs_info->balance_lock);
3235         fs_info->balance_ctl = NULL;
3236         spin_unlock(&fs_info->balance_lock);
3237
3238         kfree(bctl);
3239         ret = del_balance_item(fs_info);
3240         if (ret)
3241                 btrfs_handle_fs_error(fs_info, ret, NULL);
3242 }
3243
3244 /*
3245  * Balance filters.  Return 1 if chunk should be filtered out
3246  * (should not be balanced).
3247  */
3248 static int chunk_profiles_filter(u64 chunk_type,
3249                                  struct btrfs_balance_args *bargs)
3250 {