btrfs: cleanup helpers that reset balance state
fs/btrfs/volumes.c [sfrench/cifs-2.6.git]
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
        },
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
        [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
        [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        [BTRFS_RAID_DUP]    = 0,
        [BTRFS_RAID_RAID0]  = 0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};
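
/*
 * Editor's illustration, not part of the original file: the three tables
 * above are parallel arrays indexed by the same BTRFS_RAID_* constants, so
 * a profile's attributes, its block group flag and its mindev error code
 * all line up under one index. A minimal, hypothetical helper:
 */
static int __maybe_unused example_mindev_check(int raid_type, int num_devices)
{
        /* Index the attribute and error tables in lockstep */
        if (num_devices < btrfs_raid_array[raid_type].devs_min)
                return btrfs_raid_mindev_error[raid_type];      /* may be 0 */
        return 0;
}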

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files.
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount, either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/delete/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel:
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations, and
 * it remains set for as long as an operation stays in the Paused state.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP is cleared when the device operation is canceled or
 * completed.
 */
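
/*
 * Editor's sketch, not part of the original file: the exclusivity scheme
 * described above boils down to an atomic test-and-set of BTRFS_FS_EXCL_OP
 * before one of the listed operations starts. A hypothetical wrapper:
 */
static int __maybe_unused example_start_excl_op(struct btrfs_fs_info *fs_info)
{
        /* Fails if another exclusive operation is running or paused */
        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
                return -EBUSY;
        return 0;
}
/*
 * A paused operation returns without clearing the bit, which is what keeps
 * a second exclusive operation from starting while the first one is paused.
 */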

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
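
/*
 * Editor's example, not part of the original file: callers must check the
 * ERR_PTR-encoded return value, the way device_list_add() below does.
 */
static struct btrfs_fs_devices * __maybe_unused
example_alloc_for_super(struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        fs_devices = alloc_fs_devices(disk_super->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;      /* propagates -ENOMEM */

        /* device_list_add() also links the new entry onto fs_uuids */
        list_add(&fs_devices->fs_list, &fs_uuids);
        return fs_devices;
}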

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}
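
/*
 * Editor's example, not part of the original file: the two lookups above are
 * typically chained when a scanned super block is matched against the set of
 * known devices (device_list_add() below does exactly this). Passing a NULL
 * uuid to find_device() would match on devid alone.
 */
static struct btrfs_device * __maybe_unused
example_lookup(struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices = find_fsid(disk_super->fsid);

        if (!fs_devices)
                return NULL;
        return find_device(fs_devices,
                           btrfs_stack_device_id(&disk_super->dev_item),
                           disk_super->dev_item.uuid);
}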

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}
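
/*
 * Editor's example, not part of the original file: a typical caller opens the
 * device, inspects the super block, and releases both handles; this is the
 * shape of btrfs_open_one_device() further below.
 */
static int __maybe_unused example_read_devid(const char *path, void *holder,
                                             u64 *devid)
{
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;

        ret = btrfs_get_bdev_and_sb(path, FMODE_READ | FMODE_EXCL, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        *devid = btrfs_stack_device_id(
                        &((struct btrfs_super_block *)bh->b_data)->dev_item);
        brelse(bh);
        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
        return 0;
}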

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 *  Search for and remove all stale devices (devices which are not mounted).
 *  When both arguments are NULL, it will search for and release all stale
 *  devices.
 *  path:       Optional. When provided, it will release only unmounted
 *              devices matching this path.
 *  skip_dev:   Optional. This device will be skipped when searching for
 *              stale devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_dev)
{
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;

        list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
                if (fs_devs->opened)
                        continue;

                list_for_each_entry_safe(dev, tmp_dev,
                                         &fs_devs->devices, dev_list) {
                        int not_found = 0;

                        if (skip_dev && skip_dev == dev)
                                continue;
                        if (path && !dev->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(dev->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                btrfs_free_device(dev);
                        }
                }
        }
}
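
static void __maybe_unused example_forget_path(const char *path)
{
        /*
         * Editor's example, not part of the original file: drop any stale
         * registration of @path; device_list_add() below does this after a
         * successful (re)registration. Per the comment above, passing NULL
         * for both arguments would release every stale device instead.
         */
        btrfs_free_stale_devices(path, NULL);
}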

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add a new device to the list of registered devices
 *
 * Returns:
 * the device pointer which was just added or updated when successful
 * an error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return ERR_PTR(-EBUSY);

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                btrfs_free_stale_devices(path, device);

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and the device->name is NULL, it means
                 *    this device was missing at the time of FS mount.
                 * 2. If you are here and the device->name differs from
                 *    'path', it means either:
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing disk which was replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in cases 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away,
                 * and in case 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and you are
                         * here, there is more than one disk with the same
                         * uuid and devid. We keep the one with the larger
                         * generation number, or the last-in if the
                         * generations are equal.
                         */
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero the
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold the device_list_mutex, so it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;

        INIT_LIST_HEAD(&pending_put);

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_prepare_close_one_device(device);
                list_add(&device->dev_list, &pending_put);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * btrfs_show_devname() takes the device_list_mutex, and a call to
         * blkdev_put() can lead the VFS back into this function. So, for
         * now, do the put outside of the device_list_mutex.
         */
        while (!list_empty(&pending_put)) {
                device = list_first_entry(&pending_put,
                                struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_close_bdev(device);
                call_rcu(&device->rcu, free_device_rcu);
        }

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}
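
/*
 * Editor's worked example, not part of the original file, assuming 4KiB
 * pages: the primary super block sits at bytenr 65536, so index = 65536 >>
 * PAGE_SHIFT = 16, and (65536 + sizeof(super) - 1) >> PAGE_SHIFT is still 16
 * because the super block fits in a page (checked above) and 65536 is page
 * aligned, so the straddle check passes. A hypothetical bytenr of 65000
 * would span pages 15 and 16 and be rejected.
 */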

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path, and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
        int ret = 0;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}
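
/*
 * Editor's note, not part of the original file: btrfs_sb_offset(0) is the
 * primary super block copy at 64KiB; the later supers referred to above are
 * the mirror copies at larger fixed offsets (64MiB and 256GiB for mirrors 1
 * and 2). Scanning those here could make a stale filesystem reappear after
 * the front of the device was reformatted, which is exactly what the comment
 * in btrfs_scan_one_device() warns about.
 */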

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->fs_info->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
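
/*
 * Editor's worked example, not part of the original file: with a query range
 * [start = 150, end = 299] and a dev extent at key.offset = 100 of length 100
 * (extent_end = 200), the "key.offset <= start && extent_end > start" branch
 * applies and the extent contributes extent_end - start = 50 bytes to
 * *length. An extent at offset 250 of length 100 would instead hit the
 * "key.offset > start && key.offset <= end" branch, contribute
 * end - key.offset + 1 = 50 bytes, and end the loop.
 */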

static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}
1403
1404
1405 /*
1406  * find_free_dev_extent_start - find free space in the specified device
1407  * @device:       the device which we search the free space in
1408  * @num_bytes:    the size of the free space that we need
1409  * @search_start: the position from which to begin the search
1410  * @start:        store the start of the free space.
1411  * @len:          the size of the free space. that we find, or the size
1412  *                of the max free space if we don't find suitable free space
1413  *
1414  * this uses a pretty simple search, the expectation is that it is
1415  * called very infrequently and that a given device has a small number
1416  * of extents
1417  *
1418  * @start is used to store the start of the free space if we find. But if we
1419  * don't find suitable free space, it will be used to store the start position
1420  * of the max free space.
1421  *
1422  * @len is used to store the size of the free space that we find.
1423  * But if we don't find suitable free space, it is used to store the size of
1424  * the max free space.
1425  */
1426 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1427                                struct btrfs_device *device, u64 num_bytes,
1428                                u64 search_start, u64 *start, u64 *len)
1429 {
1430         struct btrfs_fs_info *fs_info = device->fs_info;
1431         struct btrfs_root *root = fs_info->dev_root;
1432         struct btrfs_key key;
1433         struct btrfs_dev_extent *dev_extent;
1434         struct btrfs_path *path;
1435         u64 hole_size;
1436         u64 max_hole_start;
1437         u64 max_hole_size;
1438         u64 extent_end;
1439         u64 search_end = device->total_bytes;
1440         int ret;
1441         int slot;
1442         struct extent_buffer *l;
1443
1444         /*
1445          * We don't want to overwrite the superblock on the drive nor any area
1446          * used by the boot loader (grub for example), so we make sure to start
1447          * at an offset of at least 1MB.
1448          */
1449         search_start = max_t(u64, search_start, SZ_1M);
1450
1451         path = btrfs_alloc_path();
1452         if (!path)
1453                 return -ENOMEM;
1454
1455         max_hole_start = search_start;
1456         max_hole_size = 0;
1457
1458 again:
1459         if (search_start >= search_end ||
1460                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1461                 ret = -ENOSPC;
1462                 goto out;
1463         }
1464
1465         path->reada = READA_FORWARD;
1466         path->search_commit_root = 1;
1467         path->skip_locking = 1;
1468
1469         key.objectid = device->devid;
1470         key.offset = search_start;
1471         key.type = BTRFS_DEV_EXTENT_KEY;
1472
1473         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1474         if (ret < 0)
1475                 goto out;
1476         if (ret > 0) {
1477                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1478                 if (ret < 0)
1479                         goto out;
1480         }
1481
1482         while (1) {
1483                 l = path->nodes[0];
1484                 slot = path->slots[0];
1485                 if (slot >= btrfs_header_nritems(l)) {
1486                         ret = btrfs_next_leaf(root, path);
1487                         if (ret == 0)
1488                                 continue;
1489                         if (ret < 0)
1490                                 goto out;
1491
1492                         break;
1493                 }
1494                 btrfs_item_key_to_cpu(l, &key, slot);
1495
1496                 if (key.objectid < device->devid)
1497                         goto next;
1498
1499                 if (key.objectid > device->devid)
1500                         break;
1501
1502                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1503                         goto next;
1504
1505                 if (key.offset > search_start) {
1506                         hole_size = key.offset - search_start;
1507
1508                         /*
1509                          * Have to check before we set max_hole_start, otherwise
1510                          * we could end up sending back this offset anyway.
1511                          */
1512                         if (contains_pending_extent(transaction, device,
1513                                                     &search_start,
1514                                                     hole_size)) {
1515                                 if (key.offset >= search_start) {
1516                                         hole_size = key.offset - search_start;
1517                                 } else {
1518                                         WARN_ON_ONCE(1);
1519                                         hole_size = 0;
1520                                 }
1521                         }
1522
1523                         if (hole_size > max_hole_size) {
1524                                 max_hole_start = search_start;
1525                                 max_hole_size = hole_size;
1526                         }
1527
1528                         /*
1529                          * If this free space is greater than or equal to
1530                          * what we need, it must be the largest free space
1531                          * found so far, so max_hole_start points at the
1532                          * start of this free space (or of an equally large
1533                          * one found earlier) and its length is stored in
1534                          * max_hole_size. Thus, we return max_hole_start
1535                          * and max_hole_size and go back to the caller.
1536                          */
1537                         if (hole_size >= num_bytes) {
1538                                 ret = 0;
1539                                 goto out;
1540                         }
1541                 }
1542
1543                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1544                 extent_end = key.offset + btrfs_dev_extent_length(l,
1545                                                                   dev_extent);
1546                 if (extent_end > search_start)
1547                         search_start = extent_end;
1548 next:
1549                 path->slots[0]++;
1550                 cond_resched();
1551         }
1552
1553         /*
1554          * At this point, search_start should be the end of
1555          * allocated dev extents, and when shrinking the device,
1556          * search_end may be smaller than search_start.
1557          */
1558         if (search_end > search_start) {
1559                 hole_size = search_end - search_start;
1560
1561                 if (contains_pending_extent(transaction, device, &search_start,
1562                                             hole_size)) {
1563                         btrfs_release_path(path);
1564                         goto again;
1565                 }
1566
1567                 if (hole_size > max_hole_size) {
1568                         max_hole_start = search_start;
1569                         max_hole_size = hole_size;
1570                 }
1571         }
1572
1573         /* See above. */
1574         if (max_hole_size < num_bytes)
1575                 ret = -ENOSPC;
1576         else
1577                 ret = 0;
1578
1579 out:
1580         btrfs_free_path(path);
1581         *start = max_hole_start;
1582         if (len)
1583                 *len = max_hole_size;
1584         return ret;
1585 }
1586
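/*
 * Editor's illustration, not part of the original file: a minimal sketch of
 * calling the search above; names and sizes are hypothetical. On -ENOSPC,
 * @start and @len still describe the largest hole that was found.
 *
 *	u64 hole_start, hole_len;
 *
 *	if (!find_free_dev_extent_start(trans->transaction, device, SZ_1G,
 *					0, &hole_start, &hole_len)) {
 *		// a hole of at least 1GiB starts at hole_start
 *	}
 *
 * find_free_dev_extent() below is the convenience wrapper that always
 * searches from offset 0.
 */
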
1587 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1588                          struct btrfs_device *device, u64 num_bytes,
1589                          u64 *start, u64 *len)
1590 {
1591         /* FIXME use last free of some kind */
1592         return find_free_dev_extent_start(trans->transaction, device,
1593                                           num_bytes, 0, start, len);
1594 }
1595
1596 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1597                           struct btrfs_device *device,
1598                           u64 start, u64 *dev_extent_len)
1599 {
1600         struct btrfs_fs_info *fs_info = device->fs_info;
1601         struct btrfs_root *root = fs_info->dev_root;
1602         int ret;
1603         struct btrfs_path *path;
1604         struct btrfs_key key;
1605         struct btrfs_key found_key;
1606         struct extent_buffer *leaf = NULL;
1607         struct btrfs_dev_extent *extent = NULL;
1608
1609         path = btrfs_alloc_path();
1610         if (!path)
1611                 return -ENOMEM;
1612
1613         key.objectid = device->devid;
1614         key.offset = start;
1615         key.type = BTRFS_DEV_EXTENT_KEY;
1616 again:
1617         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1618         if (ret > 0) {
1619                 ret = btrfs_previous_item(root, path, key.objectid,
1620                                           BTRFS_DEV_EXTENT_KEY);
1621                 if (ret)
1622                         goto out;
1623                 leaf = path->nodes[0];
1624                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1625                 extent = btrfs_item_ptr(leaf, path->slots[0],
1626                                         struct btrfs_dev_extent);
1627                 BUG_ON(found_key.offset > start || found_key.offset +
1628                        btrfs_dev_extent_length(leaf, extent) < start);
1629                 key = found_key;
1630                 btrfs_release_path(path);
1631                 goto again;
1632         } else if (ret == 0) {
1633                 leaf = path->nodes[0];
1634                 extent = btrfs_item_ptr(leaf, path->slots[0],
1635                                         struct btrfs_dev_extent);
1636         } else {
1637                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1638                 goto out;
1639         }
1640
1641         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1642
1643         ret = btrfs_del_item(trans, root, path);
1644         if (ret) {
1645                 btrfs_handle_fs_error(fs_info, ret,
1646                                       "Failed to remove dev extent item");
1647         } else {
1648                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1649         }
1650 out:
1651         btrfs_free_path(path);
1652         return ret;
1653 }
1654
1655 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1656                                   struct btrfs_device *device,
1657                                   u64 chunk_offset, u64 start, u64 num_bytes)
1658 {
1659         int ret;
1660         struct btrfs_path *path;
1661         struct btrfs_fs_info *fs_info = device->fs_info;
1662         struct btrfs_root *root = fs_info->dev_root;
1663         struct btrfs_dev_extent *extent;
1664         struct extent_buffer *leaf;
1665         struct btrfs_key key;
1666
1667         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1668         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1669         path = btrfs_alloc_path();
1670         if (!path)
1671                 return -ENOMEM;
1672
1673         key.objectid = device->devid;
1674         key.offset = start;
1675         key.type = BTRFS_DEV_EXTENT_KEY;
1676         ret = btrfs_insert_empty_item(trans, root, path, &key,
1677                                       sizeof(*extent));
1678         if (ret)
1679                 goto out;
1680
1681         leaf = path->nodes[0];
1682         extent = btrfs_item_ptr(leaf, path->slots[0],
1683                                 struct btrfs_dev_extent);
1684         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1685                                         BTRFS_CHUNK_TREE_OBJECTID);
1686         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1687                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1688         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1689
1690         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1691         btrfs_mark_buffer_dirty(leaf);
1692 out:
1693         btrfs_free_path(path);
1694         return ret;
1695 }
1696
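/*
 * Return the logical offset right past the last mapped chunk, i.e. where the
 * next chunk can be placed, or 0 if the mapping tree is empty.
 */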
1697 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1698 {
1699         struct extent_map_tree *em_tree;
1700         struct extent_map *em;
1701         struct rb_node *n;
1702         u64 ret = 0;
1703
1704         em_tree = &fs_info->mapping_tree.map_tree;
1705         read_lock(&em_tree->lock);
1706         n = rb_last(&em_tree->map);
1707         if (n) {
1708                 em = rb_entry(n, struct extent_map, rb_node);
1709                 ret = em->start + em->len;
1710         }
1711         read_unlock(&em_tree->lock);
1712
1713         return ret;
1714 }
1715
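/*
 * Find the next available device id: locate the highest existing DEV_ITEM
 * key in the chunk tree and add one. An empty tree yields devid 1.
 */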
1716 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1717                                     u64 *devid_ret)
1718 {
1719         int ret;
1720         struct btrfs_key key;
1721         struct btrfs_key found_key;
1722         struct btrfs_path *path;
1723
1724         path = btrfs_alloc_path();
1725         if (!path)
1726                 return -ENOMEM;
1727
1728         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1729         key.type = BTRFS_DEV_ITEM_KEY;
1730         key.offset = (u64)-1;
1731
1732         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1733         if (ret < 0)
1734                 goto error;
1735
1736         BUG_ON(ret == 0); /* Corruption */
1737
1738         ret = btrfs_previous_item(fs_info->chunk_root, path,
1739                                   BTRFS_DEV_ITEMS_OBJECTID,
1740                                   BTRFS_DEV_ITEM_KEY);
1741         if (ret) {
1742                 *devid_ret = 1;
1743         } else {
1744                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1745                                       path->slots[0]);
1746                 *devid_ret = found_key.offset + 1;
1747         }
1748         ret = 0;
1749 error:
1750         btrfs_free_path(path);
1751         return ret;
1752 }
1753
1754 /*
1755  * The device information is stored in the chunk root.
1756  * The btrfs_device struct should be fully filled in.
1757  */
1758 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1759                             struct btrfs_fs_info *fs_info,
1760                             struct btrfs_device *device)
1761 {
1762         struct btrfs_root *root = fs_info->chunk_root;
1763         int ret;
1764         struct btrfs_path *path;
1765         struct btrfs_dev_item *dev_item;
1766         struct extent_buffer *leaf;
1767         struct btrfs_key key;
1768         unsigned long ptr;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = device->devid;
1777
1778         ret = btrfs_insert_empty_item(trans, root, path, &key,
1779                                       sizeof(*dev_item));
1780         if (ret)
1781                 goto out;
1782
1783         leaf = path->nodes[0];
1784         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1785
1786         btrfs_set_device_id(leaf, dev_item, device->devid);
1787         btrfs_set_device_generation(leaf, dev_item, 0);
1788         btrfs_set_device_type(leaf, dev_item, device->type);
1789         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1790         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1791         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1792         btrfs_set_device_total_bytes(leaf, dev_item,
1793                                      btrfs_device_get_disk_total_bytes(device));
1794         btrfs_set_device_bytes_used(leaf, dev_item,
1795                                     btrfs_device_get_bytes_used(device));
1796         btrfs_set_device_group(leaf, dev_item, 0);
1797         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1798         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1799         btrfs_set_device_start_offset(leaf, dev_item, 0);
1800
1801         ptr = btrfs_device_uuid(dev_item);
1802         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1803         ptr = btrfs_device_fsid(dev_item);
1804         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1805         btrfs_mark_buffer_dirty(leaf);
1806
1807         ret = 0;
1808 out:
1809         btrfs_free_path(path);
1810         return ret;
1811 }
1812
1813 /*
1814  * Function to update ctime/mtime for a given device path.
1815  * Mainly used by ctime/mtime based probes like libblkid.
1816  */
1817 static void update_dev_time(const char *path_name)
1818 {
1819         struct file *filp;
1820
1821         filp = filp_open(path_name, O_RDWR, 0);
1822         if (IS_ERR(filp))
1823                 return;
1824         file_update_time(filp);
1825         filp_close(filp, NULL);
1826 }
1827
1828 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1829                              struct btrfs_device *device)
1830 {
1831         struct btrfs_root *root = fs_info->chunk_root;
1832         int ret;
1833         struct btrfs_path *path;
1834         struct btrfs_key key;
1835         struct btrfs_trans_handle *trans;
1836
1837         path = btrfs_alloc_path();
1838         if (!path)
1839                 return -ENOMEM;
1840
1841         trans = btrfs_start_transaction(root, 0);
1842         if (IS_ERR(trans)) {
1843                 btrfs_free_path(path);
1844                 return PTR_ERR(trans);
1845         }
1846         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1847         key.type = BTRFS_DEV_ITEM_KEY;
1848         key.offset = device->devid;
1849
1850         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1851         if (ret) {
1852                 if (ret > 0)
1853                         ret = -ENOENT;
1854                 btrfs_abort_transaction(trans, ret);
1855                 btrfs_end_transaction(trans);
1856                 goto out;
1857         }
1858
1859         ret = btrfs_del_item(trans, root, path);
1860         if (ret) {
1861                 btrfs_abort_transaction(trans, ret);
1862                 btrfs_end_transaction(trans);
1863         }
1864
1865 out:
1866         btrfs_free_path(path);
1867         if (!ret)
1868                 ret = btrfs_commit_transaction(trans);
1869         return ret;
1870 }
1871
1872 /*
1873  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1874  * filesystem. It's up to the caller to adjust that number regarding e.g. device
1875  * replace.
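 *
 * For example (editor's note): if the only profiles in use are RAID1, which
 * has devs_min == 2, then a num_devices of 1 makes this return the RAID1
 * entry of btrfs_raid_mindev_error.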
1876  */
1877 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1878                 u64 num_devices)
1879 {
1880         u64 all_avail;
1881         unsigned seq;
1882         int i;
1883
1884         do {
1885                 seq = read_seqbegin(&fs_info->profiles_lock);
1886
1887                 all_avail = fs_info->avail_data_alloc_bits |
1888                             fs_info->avail_system_alloc_bits |
1889                             fs_info->avail_metadata_alloc_bits;
1890         } while (read_seqretry(&fs_info->profiles_lock, seq));
1891
1892         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1893                 if (!(all_avail & btrfs_raid_group[i]))
1894                         continue;
1895
1896                 if (num_devices < btrfs_raid_array[i].devs_min) {
1897                         int ret = btrfs_raid_mindev_error[i];
1898
1899                         if (ret)
1900                                 return ret;
1901                 }
1902         }
1903
1904         return 0;
1905 }
1906
1907 static struct btrfs_device *btrfs_find_next_active_device(
1908                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1909 {
1910         struct btrfs_device *next_device;
1911
1912         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1913                 if (next_device != device &&
1914                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1915                     && next_device->bdev)
1916                         return next_device;
1917         }
1918
1919         return NULL;
1920 }
1921
1922 /*
1923  * Helper function to check if the given device is part of s_bdev / latest_bdev
1924  * and replace it with the provided or the next active device. In the context
1925  * where this function is called, there should always be another device (or
1926  * this_dev) which is active.
1927  */
1928 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1929                 struct btrfs_device *device, struct btrfs_device *this_dev)
1930 {
1931         struct btrfs_device *next_device;
1932
1933         if (this_dev)
1934                 next_device = this_dev;
1935         else
1936                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1937                                                                 device);
1938         ASSERT(next_device);
1939
1940         if (fs_info->sb->s_bdev &&
1941                         (fs_info->sb->s_bdev == device->bdev))
1942                 fs_info->sb->s_bdev = next_device->bdev;
1943
1944         if (fs_info->fs_devices->latest_bdev == device->bdev)
1945                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1946 }
1947
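/*
 * Remove a device, identified either by @devid or by @device_path, from a
 * mounted filesystem: shrink it to zero, delete its dev item, drop it from
 * the device lists and finally wipe its superblock copies.
 */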
1948 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1949                 u64 devid)
1950 {
1951         struct btrfs_device *device;
1952         struct btrfs_fs_devices *cur_devices;
1953         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1954         u64 num_devices;
1955         int ret = 0;
1956
1957         mutex_lock(&uuid_mutex);
1958
1959         num_devices = fs_devices->num_devices;
1960         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1961         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1962                 WARN_ON(num_devices < 1);
1963                 num_devices--;
1964         }
1965         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1966
1967         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1968         if (ret)
1969                 goto out;
1970
1971         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1972                                            &device);
1973         if (ret)
1974                 goto out;
1975
1976         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1977                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1978                 goto out;
1979         }
1980
1981         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1982             fs_info->fs_devices->rw_devices == 1) {
1983                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1984                 goto out;
1985         }
1986
1987         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1988                 mutex_lock(&fs_info->chunk_mutex);
1989                 list_del_init(&device->dev_alloc_list);
1990                 device->fs_devices->rw_devices--;
1991                 mutex_unlock(&fs_info->chunk_mutex);
1992         }
1993
1994         mutex_unlock(&uuid_mutex);
1995         ret = btrfs_shrink_device(device, 0);
1996         mutex_lock(&uuid_mutex);
1997         if (ret)
1998                 goto error_undo;
1999
2000         /*
2001          * TODO: the superblock still includes this device in its num_devices
2002          * counter although write_all_supers() is not locked out. This
2003          * could give a filesystem state which requires a degraded mount.
2004          */
2005         ret = btrfs_rm_dev_item(fs_info, device);
2006         if (ret)
2007                 goto error_undo;
2008
2009         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2010         btrfs_scrub_cancel_dev(fs_info, device);
2011
2012         /*
2013          * The device list mutex makes sure that we don't change
2014          * the device list while someone else is writing out all
2015          * the device supers. Whoever is writing all supers should
2016          * lock the device list mutex before getting the number of
2017          * devices in the super block (super_copy). Conversely,
2018          * whoever updates the number of devices in the super block
2019          * (super_copy) should hold the device list mutex.
2020          */
2021
2022         cur_devices = device->fs_devices;
2023         mutex_lock(&fs_devices->device_list_mutex);
2024         list_del_rcu(&device->dev_list);
2025
2026         device->fs_devices->num_devices--;
2027         device->fs_devices->total_devices--;
2028
2029         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2030                 device->fs_devices->missing_devices--;
2031
2032         btrfs_assign_next_active_device(fs_info, device, NULL);
2033
2034         if (device->bdev) {
2035                 device->fs_devices->open_devices--;
2036                 /* remove sysfs entry */
2037                 btrfs_sysfs_rm_device_link(fs_devices, device);
2038         }
2039
2040         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2041         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2042         mutex_unlock(&fs_devices->device_list_mutex);
2043
2044         /*
2045          * at this point, the device is zero sized and detached from
2046          * the devices list.  All that's left is to zero out the old
2047          * supers and free the device.
2048          */
2049         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2050                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2051
2052         btrfs_close_bdev(device);
2053         call_rcu(&device->rcu, free_device_rcu);
2054
2055         if (cur_devices->open_devices == 0) {
2056                 while (fs_devices) {
2057                         if (fs_devices->seed == cur_devices) {
2058                                 fs_devices->seed = cur_devices->seed;
2059                                 break;
2060                         }
2061                         fs_devices = fs_devices->seed;
2062                 }
2063                 cur_devices->seed = NULL;
2064                 close_fs_devices(cur_devices);
2065                 free_fs_devices(cur_devices);
2066         }
2067
2068 out:
2069         mutex_unlock(&uuid_mutex);
2070         return ret;
2071
2072 error_undo:
2073         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2074                 mutex_lock(&fs_info->chunk_mutex);
2075                 list_add(&device->dev_alloc_list,
2076                          &fs_devices->alloc_list);
2077                 device->fs_devices->rw_devices++;
2078                 mutex_unlock(&fs_info->chunk_mutex);
2079         }
2080         goto out;
2081 }
2082
2083 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2084                                         struct btrfs_device *srcdev)
2085 {
2086         struct btrfs_fs_devices *fs_devices;
2087
2088         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2089
2090         /*
2091          * In case of an fs with no seed, srcdev->fs_devices will point
2092          * to the fs_devices of fs_info. However, when the dev being replaced
2093          * is a seed dev, it will point to the seed's local fs_devices. In
2094          * short, srcdev will have its correct fs_devices in both cases.
2095          */
2096         fs_devices = srcdev->fs_devices;
2097
2098         list_del_rcu(&srcdev->dev_list);
2099         list_del(&srcdev->dev_alloc_list);
2100         fs_devices->num_devices--;
2101         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2102                 fs_devices->missing_devices--;
2103
2104         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2105                 fs_devices->rw_devices--;
2106
2107         if (srcdev->bdev)
2108                 fs_devices->open_devices--;
2109 }
2110
2111 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2112                                       struct btrfs_device *srcdev)
2113 {
2114         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2115
2116         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2117                 /* zero out the old super if it is writable */
2118                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2119         }
2120
2121         btrfs_close_bdev(srcdev);
2122         call_rcu(&srcdev->rcu, free_device_rcu);
2123
2124         /* If there are no devs left we'd rather delete the fs_devices */
2125         if (!fs_devices->num_devices) {
2126                 struct btrfs_fs_devices *tmp_fs_devices;
2127
2128                 /*
2129                  * On a mounted FS, num_devices can't be zero unless it's a
2130                  * seed. In case of a seed device being replaced, the replace
2131                  * target is added to the sprout FS, so there will be no
2132                  * device left under the seed FS.
2133                  */
2134                 ASSERT(fs_devices->seeding);
2135
2136                 tmp_fs_devices = fs_info->fs_devices;
2137                 while (tmp_fs_devices) {
2138                         if (tmp_fs_devices->seed == fs_devices) {
2139                                 tmp_fs_devices->seed = fs_devices->seed;
2140                                 break;
2141                         }
2142                         tmp_fs_devices = tmp_fs_devices->seed;
2143                 }
2144                 fs_devices->seed = NULL;
2145                 close_fs_devices(fs_devices);
2146                 free_fs_devices(fs_devices);
2147         }
2148 }
2149
2150 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2151                                       struct btrfs_device *tgtdev)
2152 {
2153         mutex_lock(&uuid_mutex);
2154         WARN_ON(!tgtdev);
2155         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2156
2157         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2158
2159         if (tgtdev->bdev)
2160                 fs_info->fs_devices->open_devices--;
2161
2162         fs_info->fs_devices->num_devices--;
2163
2164         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2165
2166         list_del_rcu(&tgtdev->dev_list);
2167
2168         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2169         mutex_unlock(&uuid_mutex);
2170
2171         /*
2172          * The update_dev_time() within btrfs_scratch_superblocks()
2173          * may lead to a call to btrfs_show_devname() which will try
2174          * to hold device_list_mutex. Here this device is already out
2175          * of the device list, so we don't have to hold the
2176          * device_list_mutex lock.
2177          */
2178         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2179
2180         btrfs_close_bdev(tgtdev);
2181         call_rcu(&tgtdev->rcu, free_device_rcu);
2182 }
2183
2184 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2185                                      const char *device_path,
2186                                      struct btrfs_device **device)
2187 {
2188         int ret = 0;
2189         struct btrfs_super_block *disk_super;
2190         u64 devid;
2191         u8 *dev_uuid;
2192         struct block_device *bdev;
2193         struct buffer_head *bh;
2194
2195         *device = NULL;
2196         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2197                                     fs_info->bdev_holder, 0, &bdev, &bh);
2198         if (ret)
2199                 return ret;
2200         disk_super = (struct btrfs_super_block *)bh->b_data;
2201         devid = btrfs_stack_device_id(&disk_super->dev_item);
2202         dev_uuid = disk_super->dev_item.uuid;
2203         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2204         brelse(bh);
2205         if (!*device)
2206                 ret = -ENOENT;
2207         blkdev_put(bdev, FMODE_READ);
2208         return ret;
2209 }
2210
2211 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2212                                          const char *device_path,
2213                                          struct btrfs_device **device)
2214 {
2215         *device = NULL;
2216         if (strcmp(device_path, "missing") == 0) {
2217                 struct list_head *devices;
2218                 struct btrfs_device *tmp;
2219
2220                 devices = &fs_info->fs_devices->devices;
2221                 /*
2222                  * It is safe to read the devices since the volume_mutex
2223                  * is held by the caller.
2224                  */
2225                 list_for_each_entry(tmp, devices, dev_list) {
2226                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2227                                         &tmp->dev_state) && !tmp->bdev) {
2228                                 *device = tmp;
2229                                 break;
2230                         }
2231                 }
2232
2233                 if (!*device)
2234                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2235
2236                 return 0;
2237         } else {
2238                 return btrfs_find_device_by_path(fs_info, device_path, device);
2239         }
2240 }
2241
2242 /*
2243  * Look up a device given by device id, or by the path if the id is 0.
2244  */
2245 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2246                                  const char *devpath,
2247                                  struct btrfs_device **device)
2248 {
2249         int ret;
2250
2251         if (devid) {
2252                 ret = 0;
2253                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2254                 if (!*device)
2255                         ret = -ENOENT;
2256         } else {
2257                 if (!devpath || !devpath[0])
2258                         return -EINVAL;
2259
2260                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2261                                                            device);
2262         }
2263         return ret;
2264 }
2265
2266 /*
2267  * Does all the dirty work required for changing the filesystem's UUID.
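 * The existing devices are handed over to a newly allocated seed
 * btrfs_fs_devices, the mounted filesystem gets a freshly generated fsid and
 * the seeding flag is cleared from the on-disk superblock.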
2268  */
2269 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2270 {
2271         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2272         struct btrfs_fs_devices *old_devices;
2273         struct btrfs_fs_devices *seed_devices;
2274         struct btrfs_super_block *disk_super = fs_info->super_copy;
2275         struct btrfs_device *device;
2276         u64 super_flags;
2277
2278         lockdep_assert_held(&uuid_mutex);
2279         if (!fs_devices->seeding)
2280                 return -EINVAL;
2281
2282         seed_devices = alloc_fs_devices(NULL);
2283         if (IS_ERR(seed_devices))
2284                 return PTR_ERR(seed_devices);
2285
2286         old_devices = clone_fs_devices(fs_devices);
2287         if (IS_ERR(old_devices)) {
2288                 kfree(seed_devices);
2289                 return PTR_ERR(old_devices);
2290         }
2291
2292         list_add(&old_devices->fs_list, &fs_uuids);
2293
2294         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2295         seed_devices->opened = 1;
2296         INIT_LIST_HEAD(&seed_devices->devices);
2297         INIT_LIST_HEAD(&seed_devices->alloc_list);
2298         mutex_init(&seed_devices->device_list_mutex);
2299
2300         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2301         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2302                               synchronize_rcu);
2303         list_for_each_entry(device, &seed_devices->devices, dev_list)
2304                 device->fs_devices = seed_devices;
2305
2306         mutex_lock(&fs_info->chunk_mutex);
2307         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2308         mutex_unlock(&fs_info->chunk_mutex);
2309
2310         fs_devices->seeding = 0;
2311         fs_devices->num_devices = 0;
2312         fs_devices->open_devices = 0;
2313         fs_devices->missing_devices = 0;
2314         fs_devices->rotating = 0;
2315         fs_devices->seed = seed_devices;
2316
2317         generate_random_uuid(fs_devices->fsid);
2318         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2319         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2320         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2321
2322         super_flags = btrfs_super_flags(disk_super) &
2323                       ~BTRFS_SUPER_FLAG_SEEDING;
2324         btrfs_set_super_flags(disk_super, super_flags);
2325
2326         return 0;
2327 }
2328
2329 /*
2330  * Store the expected generation for seed devices in device items.
2331  */
2332 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2333                                struct btrfs_fs_info *fs_info)
2334 {
2335         struct btrfs_root *root = fs_info->chunk_root;
2336         struct btrfs_path *path;
2337         struct extent_buffer *leaf;
2338         struct btrfs_dev_item *dev_item;
2339         struct btrfs_device *device;
2340         struct btrfs_key key;
2341         u8 fs_uuid[BTRFS_FSID_SIZE];
2342         u8 dev_uuid[BTRFS_UUID_SIZE];
2343         u64 devid;
2344         int ret;
2345
2346         path = btrfs_alloc_path();
2347         if (!path)
2348                 return -ENOMEM;
2349
2350         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2351         key.offset = 0;
2352         key.type = BTRFS_DEV_ITEM_KEY;
2353
2354         while (1) {
2355                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2356                 if (ret < 0)
2357                         goto error;
2358
2359                 leaf = path->nodes[0];
2360 next_slot:
2361                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2362                         ret = btrfs_next_leaf(root, path);
2363                         if (ret > 0)
2364                                 break;
2365                         if (ret < 0)
2366                                 goto error;
2367                         leaf = path->nodes[0];
2368                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2369                         btrfs_release_path(path);
2370                         continue;
2371                 }
2372
2373                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2374                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2375                     key.type != BTRFS_DEV_ITEM_KEY)
2376                         break;
2377
2378                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2379                                           struct btrfs_dev_item);
2380                 devid = btrfs_device_id(leaf, dev_item);
2381                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2382                                    BTRFS_UUID_SIZE);
2383                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2384                                    BTRFS_FSID_SIZE);
2385                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2386                 BUG_ON(!device); /* Logic error */
2387
2388                 if (device->fs_devices->seeding) {
2389                         btrfs_set_device_generation(leaf, dev_item,
2390                                                     device->generation);
2391                         btrfs_mark_buffer_dirty(leaf);
2392                 }
2393
2394                 path->slots[0]++;
2395                 goto next_slot;
2396         }
2397         ret = 0;
2398 error:
2399         btrfs_free_path(path);
2400         return ret;
2401 }
2402
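/*
 * Add the device at @device_path to the mounted filesystem. If the fs is a
 * seed filesystem this also sprouts it: the fsid changes and the old devices
 * become the read-only seed of the new writable filesystem.
 */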
2403 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2404 {
2405         struct btrfs_root *root = fs_info->dev_root;
2406         struct request_queue *q;
2407         struct btrfs_trans_handle *trans;
2408         struct btrfs_device *device;
2409         struct block_device *bdev;
2410         struct list_head *devices;
2411         struct super_block *sb = fs_info->sb;
2412         struct rcu_string *name;
2413         u64 tmp;
2414         int seeding_dev = 0;
2415         int ret = 0;
2416         bool unlocked = false;
2417
2418         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2419                 return -EROFS;
2420
2421         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2422                                   fs_info->bdev_holder);
2423         if (IS_ERR(bdev))
2424                 return PTR_ERR(bdev);
2425
2426         if (fs_info->fs_devices->seeding) {
2427                 seeding_dev = 1;
2428                 down_write(&sb->s_umount);
2429                 mutex_lock(&uuid_mutex);
2430         }
2431
2432         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2433
2434         devices = &fs_info->fs_devices->devices;
2435
2436         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2437         list_for_each_entry(device, devices, dev_list) {
2438                 if (device->bdev == bdev) {
2439                         ret = -EEXIST;
2440                         mutex_unlock(
2441                                 &fs_info->fs_devices->device_list_mutex);
2442                         goto error;
2443                 }
2444         }
2445         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2446
2447         device = btrfs_alloc_device(fs_info, NULL, NULL);
2448         if (IS_ERR(device)) {
2449                 /* we can safely leave the fs_devices entry around */
2450                 ret = PTR_ERR(device);
2451                 goto error;
2452         }
2453
2454         name = rcu_string_strdup(device_path, GFP_KERNEL);
2455         if (!name) {
2456                 ret = -ENOMEM;
2457                 goto error_free_device;
2458         }
2459         rcu_assign_pointer(device->name, name);
2460
2461         trans = btrfs_start_transaction(root, 0);
2462         if (IS_ERR(trans)) {
2463                 ret = PTR_ERR(trans);
2464                 goto error_free_device;
2465         }
2466
2467         q = bdev_get_queue(bdev);
2468         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2469         device->generation = trans->transid;
2470         device->io_width = fs_info->sectorsize;
2471         device->io_align = fs_info->sectorsize;
2472         device->sector_size = fs_info->sectorsize;
2473         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2474                                          fs_info->sectorsize);
2475         device->disk_total_bytes = device->total_bytes;
2476         device->commit_total_bytes = device->total_bytes;
2477         device->fs_info = fs_info;
2478         device->bdev = bdev;
2479         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2480         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2481         device->mode = FMODE_EXCL;
2482         device->dev_stats_valid = 1;
2483         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2484
2485         if (seeding_dev) {
2486                 sb->s_flags &= ~SB_RDONLY;
2487                 ret = btrfs_prepare_sprout(fs_info);
2488                 if (ret) {
2489                         btrfs_abort_transaction(trans, ret);
2490                         goto error_trans;
2491                 }
2492         }
2493
2494         device->fs_devices = fs_info->fs_devices;
2495
2496         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2497         mutex_lock(&fs_info->chunk_mutex);
2498         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2499         list_add(&device->dev_alloc_list,
2500                  &fs_info->fs_devices->alloc_list);
2501         fs_info->fs_devices->num_devices++;
2502         fs_info->fs_devices->open_devices++;
2503         fs_info->fs_devices->rw_devices++;
2504         fs_info->fs_devices->total_devices++;
2505         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2506
2507         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2508
2509         if (!blk_queue_nonrot(q))
2510                 fs_info->fs_devices->rotating = 1;
2511
2512         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2513         btrfs_set_super_total_bytes(fs_info->super_copy,
2514                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2515
2516         tmp = btrfs_super_num_devices(fs_info->super_copy);
2517         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2518
2519         /* add sysfs device entry */
2520         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2521
2522         /*
2523          * we've got more storage, clear any full flags on the space
2524          * infos
2525          */
2526         btrfs_clear_space_info_full(fs_info);
2527
2528         mutex_unlock(&fs_info->chunk_mutex);
2529         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2530
2531         if (seeding_dev) {
2532                 mutex_lock(&fs_info->chunk_mutex);
2533                 ret = init_first_rw_device(trans, fs_info);
2534                 mutex_unlock(&fs_info->chunk_mutex);
2535                 if (ret) {
2536                         btrfs_abort_transaction(trans, ret);
2537                         goto error_sysfs;
2538                 }
2539         }
2540
2541         ret = btrfs_add_dev_item(trans, fs_info, device);
2542         if (ret) {
2543                 btrfs_abort_transaction(trans, ret);
2544                 goto error_sysfs;
2545         }
2546
2547         if (seeding_dev) {
2548                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2549
2550                 ret = btrfs_finish_sprout(trans, fs_info);
2551                 if (ret) {
2552                         btrfs_abort_transaction(trans, ret);
2553                         goto error_sysfs;
2554                 }
2555
2556                 /*
2557                  * Sprouting would change the fsid of the mounted root,
2558                  * so rename the fsid on sysfs.
2559                  */
2559                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2560                                                 fs_info->fsid);
2561                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2562                         btrfs_warn(fs_info,
2563                                    "sysfs: failed to create fsid for sprout");
2564         }
2565
2566         ret = btrfs_commit_transaction(trans);
2567
2568         if (seeding_dev) {
2569                 mutex_unlock(&uuid_mutex);
2570                 up_write(&sb->s_umount);
2571                 unlocked = true;
2572
2573                 if (ret) /* transaction commit */
2574                         return ret;
2575
2576                 ret = btrfs_relocate_sys_chunks(fs_info);
2577                 if (ret < 0)
2578                         btrfs_handle_fs_error(fs_info, ret,
2579                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2580                 trans = btrfs_attach_transaction(root);
2581                 if (IS_ERR(trans)) {
2582                         if (PTR_ERR(trans) == -ENOENT)
2583                                 return 0;
2584                         ret = PTR_ERR(trans);
2585                         trans = NULL;
2586                         goto error_sysfs;
2587                 }
2588                 ret = btrfs_commit_transaction(trans);
2589         }
2590
2591         /* Update ctime/mtime for libblkid */
2592         update_dev_time(device_path);
2593         return ret;
2594
2595 error_sysfs:
2596         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2597 error_trans:
2598         if (seeding_dev)
2599                 sb->s_flags |= SB_RDONLY;
2600         if (trans)
2601                 btrfs_end_transaction(trans);
2602 error_free_device:
2603         btrfs_free_device(device);
2604 error:
2605         blkdev_put(bdev, FMODE_EXCL);
2606         if (seeding_dev && !unlocked) {
2607                 mutex_unlock(&uuid_mutex);
2608                 up_write(&sb->s_umount);
2609         }
2610         return ret;
2611 }
2612
2613 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2614                                         struct btrfs_device *device)
2615 {
2616         int ret;
2617         struct btrfs_path *path;
2618         struct btrfs_root *root = device->fs_info->chunk_root;
2619         struct btrfs_dev_item *dev_item;
2620         struct extent_buffer *leaf;
2621         struct btrfs_key key;
2622
2623         path = btrfs_alloc_path();
2624         if (!path)
2625                 return -ENOMEM;
2626
2627         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2628         key.type = BTRFS_DEV_ITEM_KEY;
2629         key.offset = device->devid;
2630
2631         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2632         if (ret < 0)
2633                 goto out;
2634
2635         if (ret > 0) {
2636                 ret = -ENOENT;
2637                 goto out;
2638         }
2639
2640         leaf = path->nodes[0];
2641         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2642
2643         btrfs_set_device_id(leaf, dev_item, device->devid);
2644         btrfs_set_device_type(leaf, dev_item, device->type);
2645         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2646         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2647         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2648         btrfs_set_device_total_bytes(leaf, dev_item,
2649                                      btrfs_device_get_disk_total_bytes(device));
2650         btrfs_set_device_bytes_used(leaf, dev_item,
2651                                     btrfs_device_get_bytes_used(device));
2652         btrfs_mark_buffer_dirty(leaf);
2653
2654 out:
2655         btrfs_free_path(path);
2656         return ret;
2657 }
2658
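/*
 * Grow @device to @new_size (rounded down to the sector size): update the
 * in-memory byte counters and the superblock total, then write out the
 * updated device item.
 */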
2659 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2660                       struct btrfs_device *device, u64 new_size)
2661 {
2662         struct btrfs_fs_info *fs_info = device->fs_info;
2663         struct btrfs_super_block *super_copy = fs_info->super_copy;
2664         struct btrfs_fs_devices *fs_devices;
2665         u64 old_total;
2666         u64 diff;
2667
2668         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2669                 return -EACCES;
2670
2671         new_size = round_down(new_size, fs_info->sectorsize);
2672
2673         mutex_lock(&fs_info->chunk_mutex);
2674         old_total = btrfs_super_total_bytes(super_copy);
2675         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2676
2677         if (new_size <= device->total_bytes ||
2678             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2679                 mutex_unlock(&fs_info->chunk_mutex);
2680                 return -EINVAL;
2681         }
2682
2683         fs_devices = fs_info->fs_devices;
2684
2685         btrfs_set_super_total_bytes(super_copy,
2686                         round_down(old_total + diff, fs_info->sectorsize));
2687         device->fs_devices->total_rw_bytes += diff;
2688
2689         btrfs_device_set_total_bytes(device, new_size);
2690         btrfs_device_set_disk_total_bytes(device, new_size);
2691         btrfs_clear_space_info_full(device->fs_info);
2692         if (list_empty(&device->resized_list))
2693                 list_add_tail(&device->resized_list,
2694                               &fs_devices->resized_devices);
2695         mutex_unlock(&fs_info->chunk_mutex);
2696
2697         return btrfs_update_device(trans, device);
2698 }
2699
2700 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2701                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2702 {
2703         struct btrfs_root *root = fs_info->chunk_root;
2704         int ret;
2705         struct btrfs_path *path;
2706         struct btrfs_key key;
2707
2708         path = btrfs_alloc_path();
2709         if (!path)
2710                 return -ENOMEM;
2711
2712         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2713         key.offset = chunk_offset;
2714         key.type = BTRFS_CHUNK_ITEM_KEY;
2715
2716         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2717         if (ret < 0)
2718                 goto out;
2719         else if (ret > 0) { /* Logic error or corruption */
2720                 btrfs_handle_fs_error(fs_info, -ENOENT,
2721                                       "Failed lookup while freeing chunk.");
2722                 ret = -ENOENT;
2723                 goto out;
2724         }
2725
2726         ret = btrfs_del_item(trans, root, path);
2727         if (ret < 0)
2728                 btrfs_handle_fs_error(fs_info, ret,
2729                                       "Failed to delete chunk item.");
2730 out:
2731         btrfs_free_path(path);
2732         return ret;
2733 }
2734
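/*
 * The superblock's sys_chunk_array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk) pairs. Find the entry matching
 * @chunk_offset, move the following entries over it and shrink the recorded
 * array size accordingly.
 */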
2735 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2736 {
2737         struct btrfs_super_block *super_copy = fs_info->super_copy;
2738         struct btrfs_disk_key *disk_key;
2739         struct btrfs_chunk *chunk;
2740         u8 *ptr;
2741         int ret = 0;
2742         u32 num_stripes;
2743         u32 array_size;
2744         u32 len = 0;
2745         u32 cur;
2746         struct btrfs_key key;
2747
2748         mutex_lock(&fs_info->chunk_mutex);
2749         array_size = btrfs_super_sys_array_size(super_copy);
2750
2751         ptr = super_copy->sys_chunk_array;
2752         cur = 0;
2753
2754         while (cur < array_size) {
2755                 disk_key = (struct btrfs_disk_key *)ptr;
2756                 btrfs_disk_key_to_cpu(&key, disk_key);
2757
2758                 len = sizeof(*disk_key);
2759
2760                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2761                         chunk = (struct btrfs_chunk *)(ptr + len);
2762                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2763                         len += btrfs_chunk_item_size(num_stripes);
2764                 } else {
2765                         ret = -EIO;
2766                         break;
2767                 }
2768                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2769                     key.offset == chunk_offset) {
2770                         memmove(ptr, ptr + len, array_size - (cur + len));
2771                         array_size -= len;
2772                         btrfs_set_super_sys_array_size(super_copy, array_size);
2773                 } else {
2774                         ptr += len;
2775                         cur += len;
2776                 }
2777         }
2778         mutex_unlock(&fs_info->chunk_mutex);
2779         return ret;
2780 }
2781
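/*
 * Look up the chunk mapping covering @logical / @length and sanity check
 * that the returned extent map actually contains @logical. The caller must
 * drop the reference with free_extent_map().
 */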
2782 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2783                                         u64 logical, u64 length)
2784 {
2785         struct extent_map_tree *em_tree;
2786         struct extent_map *em;
2787
2788         em_tree = &fs_info->mapping_tree.map_tree;
2789         read_lock(&em_tree->lock);
2790         em = lookup_extent_mapping(em_tree, logical, length);
2791         read_unlock(&em_tree->lock);
2792
2793         if (!em) {
2794                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2795                            logical, length);
2796                 return ERR_PTR(-EINVAL);
2797         }
2798
2799         if (em->start > logical || em->start + em->len < logical) {
2800                 btrfs_crit(fs_info,
2801                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2802                            logical, length, em->start, em->start + em->len);
2803                 free_extent_map(em);
2804                 return ERR_PTR(-EINVAL);
2805         }
2806
2807         /* callers are responsible for dropping em's ref. */
2808         return em;
2809 }
2810
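/*
 * Remove the chunk at @chunk_offset: free the dev extents backing each
 * stripe, delete the chunk item (and its sys_chunk_array copy for SYSTEM
 * chunks), then remove the now unused block group.
 */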
2811 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2812                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2813 {
2814         struct extent_map *em;
2815         struct map_lookup *map;
2816         u64 dev_extent_len = 0;
2817         int i, ret = 0;
2818         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2819
2820         em = get_chunk_map(fs_info, chunk_offset, 1);
2821         if (IS_ERR(em)) {
2822                 /*
2823                  * This is a logic error, but we don't want to just rely on the
2824                  * user having built with ASSERT enabled, so if ASSERT doesn't
2825                  * do anything we still error out.
2826                  */
2827                 ASSERT(0);
2828                 return PTR_ERR(em);
2829         }
2830         map = em->map_lookup;
2831         mutex_lock(&fs_info->chunk_mutex);
2832         check_system_chunk(trans, fs_info, map->type);
2833         mutex_unlock(&fs_info->chunk_mutex);
2834
2835         /*
2836          * Take the device list mutex to prevent races with the final phase of
2837          * a device replace operation that replaces the device object associated
2838          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2839          */
2840         mutex_lock(&fs_devices->device_list_mutex);
2841         for (i = 0; i < map->num_stripes; i++) {
2842                 struct btrfs_device *device = map->stripes[i].dev;
2843                 ret = btrfs_free_dev_extent(trans, device,
2844                                             map->stripes[i].physical,
2845                                             &dev_extent_len);
2846                 if (ret) {
2847                         mutex_unlock(&fs_devices->device_list_mutex);
2848                         btrfs_abort_transaction(trans, ret);
2849                         goto out;
2850                 }
2851
2852                 if (device->bytes_used > 0) {
2853                         mutex_lock(&fs_info->chunk_mutex);
2854                         btrfs_device_set_bytes_used(device,
2855                                         device->bytes_used - dev_extent_len);
2856                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2857                         btrfs_clear_space_info_full(fs_info);
2858                         mutex_unlock(&fs_info->chunk_mutex);
2859                 }
2860
2861                 if (map->stripes[i].dev) {
2862                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2863                         if (ret) {
2864                                 mutex_unlock(&fs_devices->device_list_mutex);
2865                                 btrfs_abort_transaction(trans, ret);
2866                                 goto out;
2867                         }
2868                 }
2869         }
2870         mutex_unlock(&fs_devices->device_list_mutex);
2871
2872         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2873         if (ret) {
2874                 btrfs_abort_transaction(trans, ret);
2875                 goto out;
2876         }
2877
2878         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2879
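        /*
         * SYSTEM chunks are duplicated in the superblock's sys_chunk_array
         * so that the chunk tree can be read at mount time; drop that copy
         * as well.
         */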
2880         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2881                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2882                 if (ret) {
2883                         btrfs_abort_transaction(trans, ret);
2884                         goto out;
2885                 }
2886         }
2887
2888         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2889         if (ret) {
2890                 btrfs_abort_transaction(trans, ret);
2891                 goto out;
2892         }
2893
2894 out:
2895         /* Once for us, matching the reference taken in get_chunk_map(). */
2896         free_extent_map(em);
2897         return ret;
2898 }
2899
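/*
 * Relocate the chunk at @chunk_offset: move all of its extents into other
 * chunks, then delete its device extents and chunk tree entries.  Must be
 * called with fs_info->delete_unused_bgs_mutex held.
 */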
2900 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2901 {
2902         struct btrfs_root *root = fs_info->chunk_root;
2903         struct btrfs_trans_handle *trans;
2904         int ret;
2905
2906         /*
2907          * Prevent races with automatic removal of unused block groups.
2908          * After we relocate and before we remove the chunk with offset
2909          * chunk_offset, automatic removal of the block group can kick in,
2910          * resulting in a failure when calling btrfs_remove_chunk() below.
2911          *
2912          * Make sure to acquire this mutex before doing a tree search (dev
2913          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2914          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2915          * we release the path used to search the chunk/dev tree and before
2916          * the current task acquires this mutex and calls us.
2917          */
2918         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2919
2920         ret = btrfs_can_relocate(fs_info, chunk_offset);
2921         if (ret)
2922                 return -ENOSPC;
2923
2924         /* step one, relocate all the extents inside this chunk */
2925         btrfs_scrub_pause(fs_info);
2926         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2927         btrfs_scrub_continue(fs_info);
2928         if (ret)
2929                 return ret;
2930
2931         /*
2932          * We add the kobjects here (and after forcing data chunk creation)
2933          * since relocation is the only place we'll create chunks of a new
2934          * type at runtime.  The only place where we'll remove the last
2935          * chunk of a type is the call immediately below this one.  Even
2936          * so, we're protected against races with the cleaner thread since
2937          * we're covered by the delete_unused_bgs_mutex.
2938          */
2939         btrfs_add_raid_kobjects(fs_info);
2940
2941         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2942                                                      chunk_offset);
2943         if (IS_ERR(trans)) {
2944                 ret = PTR_ERR(trans);
2945                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2946                 return ret;
2947         }
2948
2949         /*
2950          * step two, delete the device extents and the
2951          * chunk tree entries
2952          */
2953         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2954         btrfs_end_transaction(trans);
2955         return ret;
2956 }
2957
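/*
 * Walk the chunk tree from the highest offset down and relocate every
 * SYSTEM chunk.  Chunks that fail with ENOSPC are retried in a second
 * pass, since relocating their neighbours may have freed enough space.
 */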
2958 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2959 {
2960         struct btrfs_root *chunk_root = fs_info->chunk_root;
2961         struct btrfs_path *path;
2962         struct extent_buffer *leaf;
2963         struct btrfs_chunk *chunk;
2964         struct btrfs_key key;
2965         struct btrfs_key found_key;
2966         u64 chunk_type;
2967         bool retried = false;
2968         int failed = 0;
2969         int ret;
2970
2971         path = btrfs_alloc_path();
2972         if (!path)
2973                 return -ENOMEM;
2974
2975 again:
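        /* Start past the highest possible offset and walk backwards. */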
2976         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2977         key.offset = (u64)-1;
2978         key.type = BTRFS_CHUNK_ITEM_KEY;
2979
2980         while (1) {
2981                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2982                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2983                 if (ret < 0) {
2984                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2985                         goto error;
2986                 }
2987                 BUG_ON(ret == 0); /* Corruption */
2988
2989                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2990                                           key.type);
2991                 if (ret)
2992                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2993                 if (ret < 0)
2994                         goto error;
2995                 if (ret > 0)
2996                         break;
2997
2998                 leaf = path->nodes[0];
2999                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3000
3001                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3002                                        struct btrfs_chunk);
3003                 chunk_type = btrfs_chunk_type(leaf, chunk);
3004                 btrfs_release_path(path);
3005
3006                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3007                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3008                         if (ret == -ENOSPC)
3009                                 failed++;
3010                         else
3011                                 BUG_ON(ret);
3012                 }
3013                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3014
3015                 if (found_key.offset == 0)
3016                         break;
3017                 key.offset = found_key.offset - 1;
3018         }
3019         ret = 0;
3020         if (failed && !retried) {
3021                 failed = 0;
3022                 retried = true;
3023                 goto again;
3024         } else if (WARN_ON(failed && retried)) {
3025                 ret = -ENOSPC;
3026         }
3027 error:
3028         btrfs_free_path(path);
3029         return ret;
3030 }
3031
3032 /*
3033  * Return 1 : a new data chunk was allocated successfully,
3034  * return <0: an error occurred while allocating a data chunk,
3035  * return 0 : there was no need to allocate a data chunk.
3036  */
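/*
 * Relocation needs usable data space while it runs, so if the chunk at
 * @chunk_offset is a data chunk and no data space is in use at all,
 * allocate a fresh data chunk before the old one is relocated away.
 */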
3037 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3038                                       u64 chunk_offset)
3039 {
3040         struct btrfs_block_group_cache *cache;
3041         u64 bytes_used;
3042         u64 chunk_type;
3043
3044         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3045         ASSERT(cache);
3046         chunk_type = cache->flags;
3047         btrfs_put_block_group(cache);
3048
3049         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3050                 spin_lock(&fs_info->data_sinfo->lock);
3051                 bytes_used = fs_info->data_sinfo->bytes_used;
3052                 spin_unlock(&fs_info->data_sinfo->lock);
3053
3054                 if (!bytes_used) {
3055                         struct btrfs_trans_handle *trans;
3056                         int ret;
3057
3058                         trans = btrfs_join_transaction(fs_info->tree_root);
3059                         if (IS_ERR(trans))
3060                                 return PTR_ERR(trans);
3061
3062                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3063                                                       BTRFS_BLOCK_GROUP_DATA);
3064                         btrfs_end_transaction(trans);
3065                         if (ret < 0)
3066                                 return ret;
3067
3068                         btrfs_add_raid_kobjects(fs_info);
3069
3070                         return 1;
3071                 }
3072         }
3073         return 0;
3074 }
3075
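/*
 * Persist the balance control item so that an interrupted balance can be
 * resumed after a crash or remount.
 */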
3076 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3077                                struct btrfs_balance_control *bctl)
3078 {
3079         struct btrfs_root *root = fs_info->tree_root;
3080         struct btrfs_trans_handle *trans;
3081         struct btrfs_balance_item *item;
3082         struct btrfs_disk_balance_args disk_bargs;
3083         struct btrfs_path *path;
3084         struct extent_buffer *leaf;
3085         struct btrfs_key key;
3086         int ret, err;
3087
3088         path = btrfs_alloc_path();
3089         if (!path)
3090                 return -ENOMEM;
3091
3092         trans = btrfs_start_transaction(root, 0);
3093         if (IS_ERR(trans)) {
3094                 btrfs_free_path(path);
3095                 return PTR_ERR(trans);
3096         }
3097
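        /*
         * Fixed key of the balance item; btrfs_recover_balance() looks it
         * up at this same key during mount to resume an interrupted balance.
         */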
3098         key.objectid = BTRFS_BALANCE_OBJECTID;
3099         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3100         key.offset = 0;
3101
3102         ret = btrfs_insert_empty_item(trans, root, path, &key,
3103                                       sizeof(*item));
3104         if (ret)
3105                 goto out;
3106
3107         leaf = path->nodes[0];
3108         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3109
3110         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3111
3112         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3113         btrfs_set_balance_data(leaf, item, &disk_bargs);
3114         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3115         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3116         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3117         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3118
3119         btrfs_set_balance_flags(leaf, item, bctl->flags);
3120
3121         btrfs_mark_buffer_dirty(leaf);
3122 out:
3123         btrfs_free_path(path);
3124         err = btrfs_commit_transaction(trans);
3125         if (err && !ret)
3126                 ret = err;
3127         return ret;
3128 }
3129
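/*
 * Delete the persisted balance item.  Returns -ENOENT if no balance item
 * was found.
 */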
3130 static int del_balance_item(struct btrfs_fs_info *fs_info)
3131 {
3132         struct btrfs_root *root = fs_info->tree_root;
3133         struct btrfs_trans_handle *trans;
3134         struct btrfs_path *path;
3135         struct btrfs_key key;
3136         int ret, err;
3137
3138         path = btrfs_alloc_path();
3139         if (!path)
3140                 return -ENOMEM;
3141
3142         trans = btrfs_start_transaction(root, 0);
3143         if (IS_ERR(trans)) {
3144                 btrfs_free_path(path);
3145                 return PTR_ERR(trans);
3146         }
3147
3148         key.objectid = BTRFS_BALANCE_OBJECTID;
3149         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3150         key.offset = 0;
3151
3152         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3153         if (ret < 0)
3154                 goto out;
3155         if (ret > 0) {
3156                 ret = -ENOENT;
3157                 goto out;
3158         }
3159
3160         ret = btrfs_del_item(trans, root, path);
3161 out:
3162         btrfs_free_path(path);
3163         err = btrfs_commit_transaction(trans);
3164         if (err && !ret)
3165                 ret = err;
3166         return ret;
3167 }
3168
3169 /*
3170  * This is a heuristic used to reduce the number of chunks balanced on
3171  * resume after balance was interrupted.
3172  */
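/*
 * For example, an interrupted convert balance resumes in soft mode, so
 * chunks that already have the target profile are skipped, and the 90%
 * usage filter skips chunks that are at least 90% full - which chunks
 * balanced by a previous pass should be.
 */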
3173 static void update_balance_args(struct btrfs_balance_control *bctl)
3174 {
3175         /*
3176          * Turn on soft mode for chunk types that were being converted.
3177          */
3178         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3179                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3180         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3181                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3182         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3183                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3184
3185         /*
3186          * Turn on the usage filter if it is not already in use.  The
3187          * idea is that chunks we have already balanced should be
3188          * reasonably full.  Don't do it for chunks that are being
3189          * converted - that would keep us from relocating unconverted
3190          * (albeit full) chunks.
3191          */
3192         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3193             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3194             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3195                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3196                 bctl->data.usage = 90;
3197         }
3198         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3199             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3200             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3201                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3202                 bctl->sys.usage = 90;
3203         }
3204         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3205             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3206             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3207                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3208                 bctl->meta.usage = 90;
3209         }
3210 }
3211
3212 /*
3213  * Should be called with both balance and volume mutexes held to
3214  * serialize other volume operations (add_dev/rm_dev/resize) with
3215  * the restriper.  The same applies to reset_balance_state().
3216  */
3217 static void set_balance_control(struct btrfs_balance_control *bctl)
3218 {
3219         struct btrfs_fs_info *fs_info = bctl->fs_info;
3220
3221         BUG_ON(fs_info->balance_ctl);
3222
3223         spin_lock(&fs_info->balance_lock);
3224         fs_info->balance_ctl = bctl;
3225         spin_unlock(&fs_info->balance_lock);
3226 }
3227
3228 /*
3229  * Clear the balance status in fs_info and delete the balance item from disk.
3230  */
3231 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3232 {
3233         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3234         int ret;
3235
3236         BUG_ON(!fs_info->balance_ctl);
3237
3238         spin_lock(&fs_info->balance_lock);
3239         fs_info->balance_ctl = NULL;
3240         spin_unlock(&fs_info->balance_lock);
3241
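        /* In-memory state is cleared, now remove the item from disk too. */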
3242         kfree(bctl);
3243         ret = del_balance_item(fs_info);
3244         if (ret)
3245                 btrfs_handle_fs_error(fs_info, ret, NULL);
3246 }
3247
3248 /*
3249  * Balance filters.  Return 1 if chunk should be filtered out