fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};

const char *get_raid_name(enum btrfs_raid_types type)
{
        if (type >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[type].raid_name;
}
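
/*
 * Illustrative sketch, not part of the original file: how the table above
 * describes a profile's geometry.  For a chunk striped over num_stripes
 * devices, nparity stripes hold parity and the rest hold data, and each
 * logical byte is stored ncopies times.  The helper below is hypothetical,
 * named only for this example; it is not an existing kernel API.
 */
static inline int example_nr_data_stripes(enum btrfs_raid_types type,
                                          int num_stripes)
{
        /* e.g. RAID5: num_stripes - 1, RAID6: num_stripes - 2 */
        return num_stripes - btrfs_raid_array[type].nparity;
}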

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */
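
/*
 * Illustrative sketch, not part of the original file: how an exclusive
 * operation described above is typically claimed.  The flag is taken with
 * an atomic test-and-set and must be cleared with clear_bit() once the
 * operation completes or is canceled.  The helper name is hypothetical;
 * the real call sites live in the ioctl and dev-replace code.
 */
static inline bool example_try_start_excl_op(struct btrfs_fs_info *fs_info)
{
        /* false means another exclusive operation is already running */
        return !test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}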

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
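
/*
 * Illustrative sketch, not part of the original file: the intended use of
 * alloc_fs_devices() per its comment above.  The result is checked with
 * IS_ERR(), and because the struct is not yet linked onto any list, an
 * early error path may free it directly with kfree().  The wrapper name
 * is hypothetical.
 */
static inline struct btrfs_fs_devices *example_alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs = alloc_fs_devices(fsid);

        if (IS_ERR(fs_devs))
                return fs_devs;         /* propagates ERR_PTR(-ENOMEM) */
        /* ... if a later setup step fails before linking: kfree(fs_devs); */
        return fs_devs;
}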

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}
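
/*
 * Illustrative only, not part of the original file: per the lookup rules
 * documented above, both call forms are valid --
 *
 *	find_device(fs_devices, devid, disk_super->dev_item.uuid);
 *		exact match on both devid and uuid
 *	find_device(fs_devices, devid, NULL);
 *		match on devid alone
 */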

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{

        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *
 *  @path:         Optional. When provided, it will release all unmounted
 *                 devices matching this path only.
 *  @skip_device:  Optional. Will skip this device when searching for the
 *                 stale devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;

        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
                mutex_lock(&fs_devices->device_list_mutex);
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        continue;
                }

                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        int not_found = 0;

                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(device->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        if (fs_devices->num_devices == 0)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);
                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }
}

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at the time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing disk which was replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after the FS has been mounted.  We're
                 * still tracking a problem where systems fail mount by
                 * subvolume id when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if the generations are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero the
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We have held the volume lock, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove any device that does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;

        call_rcu(&device->rcu, free_device_rcu);
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_close_one_device(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        lockdep_assert_held(&uuid_mutex);

        mutex_lock(&fs_devices->device_list_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
                                           void *holder)
{
        struct btrfs_super_block *disk_super;
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
        struct page *page;
        u64 bytenr;

        lockdep_assert_held(&uuid_mutex);

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                device = ERR_PTR(-EINVAL);
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super, &new_device_added);
        if (!IS_ERR(device)) {
                if (new_device_added)
                        btrfs_free_stale_devices(path, device);
        }

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);

        return device;
}

static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space.
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

1445                         /*
1446                          * If this free space is larger than what we need,
1447                          * it must be the largest free space found so far,
1448                          * so max_hole_start must point to the start of
1449                          * this free space and the length of this free
1450                          * space is stored in max_hole_size. Thus, we
1451                          * return max_hole_start and max_hole_size and go
1452                          * back to the caller.
1453                          */
1454                         if (hole_size >= num_bytes) {
1455                                 ret = 0;
1456                                 goto out;
1457                         }
1458                 }
1459
1460                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1461                 extent_end = key.offset + btrfs_dev_extent_length(l,
1462                                                                   dev_extent);
1463                 if (extent_end > search_start)
1464                         search_start = extent_end;
1465 next:
1466                 path->slots[0]++;
1467                 cond_resched();
1468         }
1469
1470         /*
1471          * At this point, search_start should be the end of
1472          * allocated dev extents, and when shrinking the device,
1473          * search_end may be smaller than search_start.
1474          */
1475         if (search_end > search_start) {
1476                 hole_size = search_end - search_start;
1477
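                /*
                 * If the trailing hole overlaps an extent pending in the
                 * current transaction, contains_pending_extent() has bumped
                 * search_start past it, so restart the search from there.
                 */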
1478                 if (contains_pending_extent(transaction, device, &search_start,
1479                                             hole_size)) {
1480                         btrfs_release_path(path);
1481                         goto again;
1482                 }
1483
1484                 if (hole_size > max_hole_size) {
1485                         max_hole_start = search_start;
1486                         max_hole_size = hole_size;
1487                 }
1488         }
1489
1490         /* See above. */
1491         if (max_hole_size < num_bytes)
1492                 ret = -ENOSPC;
1493         else
1494                 ret = 0;
1495
1496 out:
1497         btrfs_free_path(path);
1498         *start = max_hole_start;
1499         if (len)
1500                 *len = max_hole_size;
1501         return ret;
1502 }
1503
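/*
 * Convenience wrapper around find_free_dev_extent_start() that searches from
 * offset 0; the helper itself clamps the start of the search to 1MiB to
 * protect the superblock and the boot loader area.
 */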
1504 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1505                          struct btrfs_device *device, u64 num_bytes,
1506                          u64 *start, u64 *len)
1507 {
1508         /* FIXME use last free of some kind */
1509         return find_free_dev_extent_start(trans->transaction, device,
1510                                           num_bytes, 0, start, len);
1511 }
1512
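/*
 * Remove the dev extent item of @device covering physical offset @start from
 * the device tree. The extent may be recorded under an earlier key, so on an
 * inexact match we step back to the previous item and verify that it still
 * covers @start. On success the length of the removed extent is returned in
 * *dev_extent_len.
 */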
1513 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1514                           struct btrfs_device *device,
1515                           u64 start, u64 *dev_extent_len)
1516 {
1517         struct btrfs_fs_info *fs_info = device->fs_info;
1518         struct btrfs_root *root = fs_info->dev_root;
1519         int ret;
1520         struct btrfs_path *path;
1521         struct btrfs_key key;
1522         struct btrfs_key found_key;
1523         struct extent_buffer *leaf = NULL;
1524         struct btrfs_dev_extent *extent = NULL;
1525
1526         path = btrfs_alloc_path();
1527         if (!path)
1528                 return -ENOMEM;
1529
1530         key.objectid = device->devid;
1531         key.offset = start;
1532         key.type = BTRFS_DEV_EXTENT_KEY;
1533 again:
1534         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1535         if (ret > 0) {
1536                 ret = btrfs_previous_item(root, path, key.objectid,
1537                                           BTRFS_DEV_EXTENT_KEY);
1538                 if (ret)
1539                         goto out;
1540                 leaf = path->nodes[0];
1541                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1542                 extent = btrfs_item_ptr(leaf, path->slots[0],
1543                                         struct btrfs_dev_extent);
1544                 BUG_ON(found_key.offset > start || found_key.offset +
1545                        btrfs_dev_extent_length(leaf, extent) < start);
1546                 key = found_key;
1547                 btrfs_release_path(path);
1548                 goto again;
1549         } else if (ret == 0) {
1550                 leaf = path->nodes[0];
1551                 extent = btrfs_item_ptr(leaf, path->slots[0],
1552                                         struct btrfs_dev_extent);
1553         } else {
1554                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1555                 goto out;
1556         }
1557
1558         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1559
1560         ret = btrfs_del_item(trans, root, path);
1561         if (ret) {
1562                 btrfs_handle_fs_error(fs_info, ret,
1563                                       "Failed to remove dev extent item");
1564         } else {
1565                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1566         }
1567 out:
1568         btrfs_free_path(path);
1569         return ret;
1570 }
1571
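/*
 * Insert a dev extent item for @device, mapping the physical range
 * [@start, @start + @num_bytes) to the chunk at @chunk_offset.
 */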
1572 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1573                                   struct btrfs_device *device,
1574                                   u64 chunk_offset, u64 start, u64 num_bytes)
1575 {
1576         int ret;
1577         struct btrfs_path *path;
1578         struct btrfs_fs_info *fs_info = device->fs_info;
1579         struct btrfs_root *root = fs_info->dev_root;
1580         struct btrfs_dev_extent *extent;
1581         struct extent_buffer *leaf;
1582         struct btrfs_key key;
1583
1584         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1585         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1586         path = btrfs_alloc_path();
1587         if (!path)
1588                 return -ENOMEM;
1589
1590         key.objectid = device->devid;
1591         key.offset = start;
1592         key.type = BTRFS_DEV_EXTENT_KEY;
1593         ret = btrfs_insert_empty_item(trans, root, path, &key,
1594                                       sizeof(*extent));
1595         if (ret)
1596                 goto out;
1597
1598         leaf = path->nodes[0];
1599         extent = btrfs_item_ptr(leaf, path->slots[0],
1600                                 struct btrfs_dev_extent);
1601         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1602                                         BTRFS_CHUNK_TREE_OBJECTID);
1603         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1604                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1605         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1606
1607         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1608         btrfs_mark_buffer_dirty(leaf);
1609 out:
1610         btrfs_free_path(path);
1611         return ret;
1612 }
1613
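/*
 * Return the logical offset right past the highest existing chunk mapping,
 * i.e. the first logical address usable for a new chunk. Chunk mappings are
 * cached in the extent map tree, so looking at the last rb-tree entry is
 * sufficient.
 */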
1614 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1615 {
1616         struct extent_map_tree *em_tree;
1617         struct extent_map *em;
1618         struct rb_node *n;
1619         u64 ret = 0;
1620
1621         em_tree = &fs_info->mapping_tree.map_tree;
1622         read_lock(&em_tree->lock);
1623         n = rb_last(&em_tree->map.rb_root);
1624         if (n) {
1625                 em = rb_entry(n, struct extent_map, rb_node);
1626                 ret = em->start + em->len;
1627         }
1628         read_unlock(&em_tree->lock);
1629
1630         return ret;
1631 }
1632
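/*
 * Find the next available devid: look up the dev item with the highest devid
 * in the chunk tree and return that devid plus one, or 1 if there is none.
 */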
1633 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1634                                     u64 *devid_ret)
1635 {
1636         int ret;
1637         struct btrfs_key key;
1638         struct btrfs_key found_key;
1639         struct btrfs_path *path;
1640
1641         path = btrfs_alloc_path();
1642         if (!path)
1643                 return -ENOMEM;
1644
1645         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1646         key.type = BTRFS_DEV_ITEM_KEY;
1647         key.offset = (u64)-1;
1648
1649         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1650         if (ret < 0)
1651                 goto error;
1652
1653         BUG_ON(ret == 0); /* Corruption */
1654
1655         ret = btrfs_previous_item(fs_info->chunk_root, path,
1656                                   BTRFS_DEV_ITEMS_OBJECTID,
1657                                   BTRFS_DEV_ITEM_KEY);
1658         if (ret) {
1659                 *devid_ret = 1;
1660         } else {
1661                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1662                                       path->slots[0]);
1663                 *devid_ret = found_key.offset + 1;
1664         }
1665         ret = 0;
1666 error:
1667         btrfs_free_path(path);
1668         return ret;
1669 }
1670
1671 /*
1672  * The device information is stored in the chunk root.
1673  * The btrfs_device struct should be fully filled in before calling this.
1674  */
1675 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1676                             struct btrfs_device *device)
1677 {
1678         int ret;
1679         struct btrfs_path *path;
1680         struct btrfs_dev_item *dev_item;
1681         struct extent_buffer *leaf;
1682         struct btrfs_key key;
1683         unsigned long ptr;
1684
1685         path = btrfs_alloc_path();
1686         if (!path)
1687                 return -ENOMEM;
1688
1689         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1690         key.type = BTRFS_DEV_ITEM_KEY;
1691         key.offset = device->devid;
1692
1693         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1694                                       &key, sizeof(*dev_item));
1695         if (ret)
1696                 goto out;
1697
1698         leaf = path->nodes[0];
1699         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1700
1701         btrfs_set_device_id(leaf, dev_item, device->devid);
1702         btrfs_set_device_generation(leaf, dev_item, 0);
1703         btrfs_set_device_type(leaf, dev_item, device->type);
1704         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1705         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1706         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1707         btrfs_set_device_total_bytes(leaf, dev_item,
1708                                      btrfs_device_get_disk_total_bytes(device));
1709         btrfs_set_device_bytes_used(leaf, dev_item,
1710                                     btrfs_device_get_bytes_used(device));
1711         btrfs_set_device_group(leaf, dev_item, 0);
1712         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1713         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1714         btrfs_set_device_start_offset(leaf, dev_item, 0);
1715
1716         ptr = btrfs_device_uuid(dev_item);
1717         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1718         ptr = btrfs_device_fsid(dev_item);
1719         write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1720         btrfs_mark_buffer_dirty(leaf);
1721
1722         ret = 0;
1723 out:
1724         btrfs_free_path(path);
1725         return ret;
1726 }
1727
1728 /*
1729  * Update ctime/mtime for a given device path.
1730  * Mainly used by ctime/mtime-based probes such as libblkid.
1731  */
1732 static void update_dev_time(const char *path_name)
1733 {
1734         struct file *filp;
1735
1736         filp = filp_open(path_name, O_RDWR, 0);
1737         if (IS_ERR(filp))
1738                 return;
1739         file_update_time(filp);
1740         filp_close(filp, NULL);
1741 }
1742
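/*
 * Delete the dev item of @device from the chunk tree, in its own transaction
 * which is committed on success.
 */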
1743 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1744                              struct btrfs_device *device)
1745 {
1746         struct btrfs_root *root = fs_info->chunk_root;
1747         int ret;
1748         struct btrfs_path *path;
1749         struct btrfs_key key;
1750         struct btrfs_trans_handle *trans;
1751
1752         path = btrfs_alloc_path();
1753         if (!path)
1754                 return -ENOMEM;
1755
1756         trans = btrfs_start_transaction(root, 0);
1757         if (IS_ERR(trans)) {
1758                 btrfs_free_path(path);
1759                 return PTR_ERR(trans);
1760         }
1761         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1762         key.type = BTRFS_DEV_ITEM_KEY;
1763         key.offset = device->devid;
1764
1765         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1766         if (ret) {
1767                 if (ret > 0)
1768                         ret = -ENOENT;
1769                 btrfs_abort_transaction(trans, ret);
1770                 btrfs_end_transaction(trans);
1771                 goto out;
1772         }
1773
1774         ret = btrfs_del_item(trans, root, path);
1775         if (ret) {
1776                 btrfs_abort_transaction(trans, ret);
1777                 btrfs_end_transaction(trans);
1778         }
1779
1780 out:
1781         btrfs_free_path(path);
1782         if (!ret)
1783                 ret = btrfs_commit_transaction(trans);
1784         return ret;
1785 }
1786
1787 /*
1788  * Verify that @num_devices satisfies the RAID profile constraints of the whole
1789  * filesystem. It's up to the caller to adjust that number for special cases,
1790  * e.g. device replace.
1791  */
1792 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1793                 u64 num_devices)
1794 {
1795         u64 all_avail;
1796         unsigned seq;
1797         int i;
1798
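        /*
         * Sample the currently available allocation profiles; the seqlock
         * retry loop guards against concurrent profile updates.
         */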
1799         do {
1800                 seq = read_seqbegin(&fs_info->profiles_lock);
1801
1802                 all_avail = fs_info->avail_data_alloc_bits |
1803                             fs_info->avail_system_alloc_bits |
1804                             fs_info->avail_metadata_alloc_bits;
1805         } while (read_seqretry(&fs_info->profiles_lock, seq));
1806
1807         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1808                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1809                         continue;
1810
1811                 if (num_devices < btrfs_raid_array[i].devs_min) {
1812                         int ret = btrfs_raid_array[i].mindev_error;
1813
1814                         if (ret)
1815                                 return ret;
1816                 }
1817         }
1818
1819         return 0;
1820 }
1821
1822 static struct btrfs_device *btrfs_find_next_active_device(
1823                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1824 {
1825         struct btrfs_device *next_device;
1826
1827         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1828                 if (next_device != device &&
1829                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1830                     && next_device->bdev)
1831                         return next_device;
1832         }
1833
1834         return NULL;
1835 }
1836
1837 /*
1838  * Helper function to check if the given device is part of s_bdev / latest_bdev
1839  * and replace it with the provided or the next active device. In the context
1840  * where this function is called, there should always be another device (or
1841  * this_dev) which is active.
1842  */
1843 void btrfs_assign_next_active_device(struct btrfs_device *device,
1844                                      struct btrfs_device *this_dev)
1845 {
1846         struct btrfs_fs_info *fs_info = device->fs_info;
1847         struct btrfs_device *next_device;
1848
1849         if (this_dev)
1850                 next_device = this_dev;
1851         else
1852                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1853                                                                 device);
1854         ASSERT(next_device);
1855
1856         if (fs_info->sb->s_bdev &&
1857                         (fs_info->sb->s_bdev == device->bdev))
1858                 fs_info->sb->s_bdev = next_device->bdev;
1859
1860         if (fs_info->fs_devices->latest_bdev == device->bdev)
1861                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1862 }
1863
1864 /*
1865  * Return btrfs_fs_devices::num_devices, excluding the device that's
1866  * currently being replaced, if any.
1867  */
1868 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1869 {
1870         u64 num_devices = fs_info->fs_devices->num_devices;
1871
1872         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1873         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1874                 ASSERT(num_devices > 1);
1875                 num_devices--;
1876         }
1877         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1878
1879         return num_devices;
1880 }
1881
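/*
 * Remove a device from the filesystem: check that the RAID profile
 * constraints are still met with one device less, shrink the device to zero,
 * delete its dev item, detach it from the device lists and finally scratch
 * its superblocks so it is no longer recognized as part of this filesystem.
 */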
1882 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1883                 u64 devid)
1884 {
1885         struct btrfs_device *device;
1886         struct btrfs_fs_devices *cur_devices;
1887         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1888         u64 num_devices;
1889         int ret = 0;
1890
1891         mutex_lock(&uuid_mutex);
1892
1893         num_devices = btrfs_num_devices(fs_info);
1894
1895         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1896         if (ret)
1897                 goto out;
1898
1899         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
1900
1901         if (IS_ERR(device)) {
1902                 if (PTR_ERR(device) == -ENOENT &&
1903                     strcmp(device_path, "missing") == 0)
1904                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1905                 else
1906                         ret = PTR_ERR(device);
1907                 goto out;
1908         }
1909
1910         if (btrfs_pinned_by_swapfile(fs_info, device)) {
1911                 btrfs_warn_in_rcu(fs_info,
1912                   "cannot remove device %s (devid %llu) due to active swapfile",
1913                                   rcu_str_deref(device->name), device->devid);
1914                 ret = -ETXTBSY;
1915                 goto out;
1916         }
1917
1918         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1919                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1920                 goto out;
1921         }
1922
1923         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1924             fs_info->fs_devices->rw_devices == 1) {
1925                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1926                 goto out;
1927         }
1928
1929         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1930                 mutex_lock(&fs_info->chunk_mutex);
1931                 list_del_init(&device->dev_alloc_list);
1932                 device->fs_devices->rw_devices--;
1933                 mutex_unlock(&fs_info->chunk_mutex);
1934         }
1935
1936         mutex_unlock(&uuid_mutex);
1937         ret = btrfs_shrink_device(device, 0);
1938         mutex_lock(&uuid_mutex);
1939         if (ret)
1940                 goto error_undo;
1941
1942         /*
1943          * TODO: the superblock still includes this device in its num_devices
1944          * counter although write_all_supers() is not locked out. This
1945          * could give a filesystem state which requires a degraded mount.
1946          */
1947         ret = btrfs_rm_dev_item(fs_info, device);
1948         if (ret)
1949                 goto error_undo;
1950
1951         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1952         btrfs_scrub_cancel_dev(fs_info, device);
1953
1954         /*
1955          * The device list mutex makes sure that we don't change the device
1956          * list while someone else is writing out all the device supers.
1957          * Whoever is writing all supers should lock the device list mutex
1958          * before getting the number of devices in the super block
1959          * (super_copy). Conversely, whoever updates the number of devices
1960          * in the super block (super_copy) should hold the device list
1961          * mutex.
1962          */
1963
1964         /*
1965          * In normal cases cur_devices == fs_devices. But when deleting a
1966          * seed device, cur_devices points to the seed's own fs_devices,
1967          * listed under fs_devices->seed.
1968          */
1969         cur_devices = device->fs_devices;
1970         mutex_lock(&fs_devices->device_list_mutex);
1971         list_del_rcu(&device->dev_list);
1972
1973         cur_devices->num_devices--;
1974         cur_devices->total_devices--;
1975         /* Update total_devices of the parent fs_devices if it's seed */
1976         if (cur_devices != fs_devices)
1977                 fs_devices->total_devices--;
1978
1979         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1980                 cur_devices->missing_devices--;
1981
1982         btrfs_assign_next_active_device(device, NULL);
1983
1984         if (device->bdev) {
1985                 cur_devices->open_devices--;
1986                 /* remove sysfs entry */
1987                 btrfs_sysfs_rm_device_link(fs_devices, device);
1988         }
1989
1990         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1991         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1992         mutex_unlock(&fs_devices->device_list_mutex);
1993
1994         /*
1995          * At this point, the device is zero sized and detached from
1996          * the devices list. All that's left is to zero out the old
1997          * supers and free the device.
1998          */
1999         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2000                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2001
2002         btrfs_close_bdev(device);
2003         call_rcu(&device->rcu, free_device_rcu);
2004
2005         if (cur_devices->open_devices == 0) {
2006                 while (fs_devices) {
2007                         if (fs_devices->seed == cur_devices) {
2008                                 fs_devices->seed = cur_devices->seed;
2009                                 break;
2010                         }
2011                         fs_devices = fs_devices->seed;
2012                 }
2013                 cur_devices->seed = NULL;
2014                 close_fs_devices(cur_devices);
2015                 free_fs_devices(cur_devices);
2016         }
2017
2018 out:
2019         mutex_unlock(&uuid_mutex);
2020         return ret;
2021
2022 error_undo:
2023         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2024                 mutex_lock(&fs_info->chunk_mutex);
2025                 list_add(&device->dev_alloc_list,
2026                          &fs_devices->alloc_list);
2027                 device->fs_devices->rw_devices++;
2028                 mutex_unlock(&fs_info->chunk_mutex);
2029         }
2030         goto out;
2031 }
2032
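/*
 * Detach the replaced source device from its fs_devices and adjust the
 * device counters. The caller must hold device_list_mutex, which the
 * lockdep assertion below enforces.
 */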
2033 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2034 {
2035         struct btrfs_fs_devices *fs_devices;
2036
2037         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2038
2039         /*
2040          * In case of a filesystem with no seed, srcdev->fs_devices points to
2041          * the fs_devices of fs_info. However, when the device being replaced
2042          * is a seed device, it points to the seed's local fs_devices. In
2043          * short, srcdev has its correct fs_devices in both cases.
2044          */
2045         fs_devices = srcdev->fs_devices;
2046
2047         list_del_rcu(&srcdev->dev_list);
2048         list_del(&srcdev->dev_alloc_list);
2049         fs_devices->num_devices--;
2050         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2051                 fs_devices->missing_devices--;
2052
2053         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2054                 fs_devices->rw_devices--;
2055
2056         if (srcdev->bdev)
2057                 fs_devices->open_devices--;
2058 }
2059
2060 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2061                                       struct btrfs_device *srcdev)
2062 {
2063         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2064
2065         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2066                 /* zero out the old super if it is writable */
2067                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2068         }
2069
2070         btrfs_close_bdev(srcdev);
2071         call_rcu(&srcdev->rcu, free_device_rcu);
2072
2073         /* If there are no devices left, delete the fs_devices as well */
2074         if (!fs_devices->num_devices) {
2075                 struct btrfs_fs_devices *tmp_fs_devices;
2076
2077                 /*
2078                  * On a mounted FS, num_devices can't be zero unless it's a
2079                  * seed. In case of a seed device being replaced, the replace
2080                  * target is added to the sprout FS, so there will be no
2081                  * devices left under the seed FS.
2082                  */
2083                 ASSERT(fs_devices->seeding);
2084
2085                 tmp_fs_devices = fs_info->fs_devices;
2086                 while (tmp_fs_devices) {
2087                         if (tmp_fs_devices->seed == fs_devices) {
2088                                 tmp_fs_devices->seed = fs_devices->seed;
2089                                 break;
2090                         }
2091                         tmp_fs_devices = tmp_fs_devices->seed;
2092                 }
2093                 fs_devices->seed = NULL;
2094                 close_fs_devices(fs_devices);
2095                 free_fs_devices(fs_devices);
2096         }
2097 }
2098
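/*
 * Tear down a no longer needed replace target device: remove its sysfs
 * link, unlink it from the device list, hand over s_bdev / latest_bdev to
 * the next active device, then scratch its superblocks and release it.
 */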
2099 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2100 {
2101         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2102
2103         WARN_ON(!tgtdev);
2104         mutex_lock(&fs_devices->device_list_mutex);
2105
2106         btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2107
2108         if (tgtdev->bdev)
2109                 fs_devices->open_devices--;
2110
2111         fs_devices->num_devices--;
2112
2113         btrfs_assign_next_active_device(tgtdev, NULL);
2114
2115         list_del_rcu(&tgtdev->dev_list);
2116
2117         mutex_unlock(&fs_devices->device_list_mutex);
2118
2119         /*
2120          * The update_dev_time() within btrfs_scratch_superblocks() may
2121          * lead to a call to btrfs_show_devname() which will try to hold
2122          * device_list_mutex. Since this device is already out of the
2123          * device list, we don't have to hold the device_list_mutex lock
2124          * here.
2125          */
2126         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2127
2128         btrfs_close_bdev(tgtdev);
2129         call_rcu(&tgtdev->rcu, free_device_rcu);
2130 }
2131
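/*
 * Read the superblock from the block device at @device_path and look up the
 * in-memory btrfs_device matching the devid and UUIDs found there.
 */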
2132 static struct btrfs_device *btrfs_find_device_by_path(
2133                 struct btrfs_fs_info *fs_info, const char *device_path)
2134 {
2135         int ret = 0;
2136         struct btrfs_super_block *disk_super;
2137         u64 devid;
2138         u8 *dev_uuid;
2139         struct block_device *bdev;
2140         struct buffer_head *bh;
2141         struct btrfs_device *device;
2142
2143         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2144                                     fs_info->bdev_holder, 0, &bdev, &bh);
2145         if (ret)
2146                 return ERR_PTR(ret);
2147         disk_super = (struct btrfs_super_block *)bh->b_data;
2148         devid = btrfs_stack_device_id(&disk_super->dev_item);
2149         dev_uuid = disk_super->dev_item.uuid;
2150         device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2151         brelse(bh);
2152         if (!device)
2153                 device = ERR_PTR(-ENOENT);
2154         blkdev_put(bdev, FMODE_READ);
2155         return device;
2156 }
2157
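/*
 * Resolve @device_path to a btrfs_device. The special path "missing" selects
 * the first device that is present in the metadata but has no backing bdev.
 */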
2158 static struct btrfs_device *btrfs_find_device_missing_or_by_path(
2159                 struct btrfs_fs_info *fs_info, const char *device_path)
2160 {
2161         struct btrfs_device *device = NULL;
2162         if (strcmp(device_path, "missing") == 0) {
2163                 struct list_head *devices;
2164                 struct btrfs_device *tmp;
2165
2166                 devices = &fs_info->fs_devices->devices;
2167                 list_for_each_entry(tmp, devices, dev_list) {
2168                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2169                                         &tmp->dev_state) && !tmp->bdev) {
2170                                 device = tmp;
2171                                 break;
2172                         }
2173                 }
2174
2175                 if (!device)
2176                         return ERR_PTR(-ENOENT);
2177         } else {
2178                 device = btrfs_find_device_by_path(fs_info, device_path);
2179         }
2180
2181         return device;
2182 }
2183
2184 /*
2185  * Lookup a device given by device id, or the path if the id is 0.
2186  */
2187 struct btrfs_device *btrfs_find_device_by_devspec(
2188                 struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
2189 {
2190         struct btrfs_device *device;
2191
2192         if (devid) {
2193                 device = btrfs_find_device(fs_info, devid, NULL, NULL);
2194                 if (!device)
2195                         return ERR_PTR(-ENOENT);
2196         } else {
2197                 if (!devpath || !devpath[0])
2198                         return ERR_PTR(-EINVAL);
2199                 device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
2200         }
2201         return device;
2202 }
2203
2204 /*
2205  * Does all the dirty work required for changing the filesystem's UUID.
2206  */
2207 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2208 {
2209         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2210         struct btrfs_fs_devices *old_devices;
2211         struct btrfs_fs_devices *seed_devices;
2212         struct btrfs_super_block *disk_super = fs_info->super_copy;
2213         struct btrfs_device *device;
2214         u64 super_flags;
2215
2216         lockdep_assert_held(&uuid_mutex);
2217         if (!fs_devices->seeding)
2218                 return -EINVAL;
2219
2220         seed_devices = alloc_fs_devices(NULL);
2221         if (IS_ERR(seed_devices))
2222                 return PTR_ERR(seed_devices);
2223
2224         old_devices = clone_fs_devices(fs_devices);
2225         if (IS_ERR(old_devices)) {
2226                 kfree(seed_devices);
2227                 return PTR_ERR(old_devices);
2228         }
2229
2230         list_add(&old_devices->fs_list, &fs_uuids);
2231
2232         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2233         seed_devices->opened = 1;
2234         INIT_LIST_HEAD(&seed_devices->devices);
2235         INIT_LIST_HEAD(&seed_devices->alloc_list);
2236         mutex_init(&seed_devices->device_list_mutex);
2237
2238         mutex_lock(&fs_devices->device_list_mutex);
2239         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2240                               synchronize_rcu);
2241         list_for_each_entry(device, &seed_devices->devices, dev_list)
2242                 device->fs_devices = seed_devices;
2243
2244         mutex_lock(&fs_info->chunk_mutex);
2245         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2246         mutex_unlock(&fs_info->chunk_mutex);
2247
2248         fs_devices->seeding = 0;
2249         fs_devices->num_devices = 0;
2250         fs_devices->open_devices = 0;
2251         fs_devices->missing_devices = 0;
2252         fs_devices->rotating = 0;
2253         fs_devices->seed = seed_devices;
2254
2255         generate_random_uuid(fs_devices->fsid);
2256         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2257         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2258         mutex_unlock(&fs_devices->device_list_mutex);
2259
2260         super_flags = btrfs_super_flags(disk_super) &
2261                       ~BTRFS_SUPER_FLAG_SEEDING;
2262         btrfs_set_super_flags(disk_super, super_flags);
2263
2264         return 0;
2265 }
2266
2267 /*
2268  * Store the expected generation for seed devices in device items.
2269  */
2270 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2271                                struct btrfs_fs_info *fs_info)
2272 {
2273         struct btrfs_root *root = fs_info->chunk_root;
2274         struct btrfs_path *path;
2275         struct extent_buffer *leaf;
2276         struct btrfs_dev_item *dev_item;
2277         struct btrfs_device *device;
2278         struct btrfs_key key;
2279         u8 fs_uuid[BTRFS_FSID_SIZE];
2280         u8 dev_uuid[BTRFS_UUID_SIZE];
2281         u64 devid;
2282         int ret;
2283
2284         path = btrfs_alloc_path();
2285         if (!path)
2286                 return -ENOMEM;
2287
2288         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2289         key.offset = 0;
2290         key.type = BTRFS_DEV_ITEM_KEY;
2291
2292         while (1) {
2293                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2294                 if (ret < 0)
2295                         goto error;
2296
2297                 leaf = path->nodes[0];
2298 next_slot:
2299                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2300                         ret = btrfs_next_leaf(root, path);
2301                         if (ret > 0)
2302                                 break;
2303                         if (ret < 0)
2304                                 goto error;
2305                         leaf = path->nodes[0];
2306                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2307                         btrfs_release_path(path);
2308                         continue;
2309                 }
2310
2311                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2312                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2313                     key.type != BTRFS_DEV_ITEM_KEY)
2314                         break;
2315
2316                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2317                                           struct btrfs_dev_item);
2318                 devid = btrfs_device_id(leaf, dev_item);
2319                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2320                                    BTRFS_UUID_SIZE);
2321                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2322                                    BTRFS_FSID_SIZE);
2323                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2324                 BUG_ON(!device); /* Logic error */
2325
2326                 if (device->fs_devices->seeding) {
2327                         btrfs_set_device_generation(leaf, dev_item,
2328                                                     device->generation);
2329                         btrfs_mark_buffer_dirty(leaf);
2330                 }
2331
2332                 path->slots[0]++;
2333                 goto next_slot;
2334         }
2335         ret = 0;
2336 error:
2337         btrfs_free_path(path);
2338         return ret;
2339 }
2340
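/*
 * Add the device at @device_path to a mounted filesystem. If the filesystem
 * is a seed, this also sprouts a new writable filesystem on top of it, see
 * btrfs_prepare_sprout() and btrfs_finish_sprout().
 */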
2341 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2342 {
2343         struct btrfs_root *root = fs_info->dev_root;
2344         struct request_queue *q;
2345         struct btrfs_trans_handle *trans;
2346         struct btrfs_device *device;
2347         struct block_device *bdev;
2348         struct super_block *sb = fs_info->sb;
2349         struct rcu_string *name;
2350         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2351         u64 orig_super_total_bytes;
2352         u64 orig_super_num_devices;
2353         int seeding_dev = 0;
2354         int ret = 0;
2355         bool unlocked = false;
2356
2357         if (sb_rdonly(sb) && !fs_devices->seeding)
2358                 return -EROFS;
2359
2360         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2361                                   fs_info->bdev_holder);
2362         if (IS_ERR(bdev))
2363                 return PTR_ERR(bdev);
2364
2365         if (fs_devices->seeding) {
2366                 seeding_dev = 1;
2367                 down_write(&sb->s_umount);
2368                 mutex_lock(&uuid_mutex);
2369         }
2370
2371         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2372
2373         mutex_lock(&fs_devices->device_list_mutex);
2374         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2375                 if (device->bdev == bdev) {
2376                         ret = -EEXIST;
2377                         mutex_unlock(&fs_devices->device_list_mutex);
2379                         goto error;
2380                 }
2381         }
2382         mutex_unlock(&fs_devices->device_list_mutex);
2383
2384         device = btrfs_alloc_device(fs_info, NULL, NULL);
2385         if (IS_ERR(device)) {
2386                 /* we can safely leave the fs_devices entry around */
2387                 ret = PTR_ERR(device);
2388                 goto error;
2389         }
2390
2391         name = rcu_string_strdup(device_path, GFP_KERNEL);
2392         if (!name) {
2393                 ret = -ENOMEM;
2394                 goto error_free_device;
2395         }
2396         rcu_assign_pointer(device->name, name);
2397
2398         trans = btrfs_start_transaction(root, 0);
2399         if (IS_ERR(trans)) {
2400                 ret = PTR_ERR(trans);
2401                 goto error_free_device;
2402         }
2403
2404         q = bdev_get_queue(bdev);
2405         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2406         device->generation = trans->transid;
2407         device->io_width = fs_info->sectorsize;
2408         device->io_align = fs_info->sectorsize;
2409         device->sector_size = fs_info->sectorsize;
2410         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2411                                          fs_info->sectorsize);
2412         device->disk_total_bytes = device->total_bytes;
2413         device->commit_total_bytes = device->total_bytes;
2414         device->fs_info = fs_info;
2415         device->bdev = bdev;
2416         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2417         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2418         device->mode = FMODE_EXCL;
2419         device->dev_stats_valid = 1;
2420         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2421
2422         if (seeding_dev) {
2423                 sb->s_flags &= ~SB_RDONLY;
2424                 ret = btrfs_prepare_sprout(fs_info);
2425                 if (ret) {
2426                         btrfs_abort_transaction(trans, ret);
2427                         goto error_trans;
2428                 }
2429         }
2430
2431         device->fs_devices = fs_devices;
2432
2433         mutex_lock(&fs_devices->device_list_mutex);
2434         mutex_lock(&fs_info->chunk_mutex);
2435         list_add_rcu(&device->dev_list, &fs_devices->devices);
2436         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2437         fs_devices->num_devices++;
2438         fs_devices->open_devices++;
2439         fs_devices->rw_devices++;
2440         fs_devices->total_devices++;
2441         fs_devices->total_rw_bytes += device->total_bytes;
2442
2443         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2444
2445         if (!blk_queue_nonrot(q))
2446                 fs_devices->rotating = 1;
2447
2448         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2449         btrfs_set_super_total_bytes(fs_info->super_copy,
2450                 round_down(orig_super_total_bytes + device->total_bytes,
2451                            fs_info->sectorsize));
2452
2453         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2454         btrfs_set_super_num_devices(fs_info->super_copy,
2455                                     orig_super_num_devices + 1);
2456
2457         /* add sysfs device entry */
2458         btrfs_sysfs_add_device_link(fs_devices, device);
2459
2460         /*
2461          * We've got more storage, clear any full flags on the space
2462          * infos.
2463          */
2464         btrfs_clear_space_info_full(fs_info);
2465
2466         mutex_unlock(&fs_info->chunk_mutex);
2467         mutex_unlock(&fs_devices->device_list_mutex);
2468
2469         if (seeding_dev) {
2470                 mutex_lock(&fs_info->chunk_mutex);
2471                 ret = init_first_rw_device(trans, fs_info);
2472                 mutex_unlock(&fs_info->chunk_mutex);
2473                 if (ret) {
2474                         btrfs_abort_transaction(trans, ret);
2475                         goto error_sysfs;
2476                 }
2477         }
2478
2479         ret = btrfs_add_dev_item(trans, device);
2480         if (ret) {
2481                 btrfs_abort_transaction(trans, ret);
2482                 goto error_sysfs;
2483         }
2484
2485         if (seeding_dev) {
2486                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2487
2488                 ret = btrfs_finish_sprout(trans, fs_info);
2489                 if (ret) {
2490                         btrfs_abort_transaction(trans, ret);
2491                         goto error_sysfs;
2492                 }
2493
2494                 /* Sprouting would change the fsid of the mounted root,
2495                  * so rename the fsid in sysfs.
2496                  */
2497                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2498                                                 fs_info->fsid);
2499                 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2500                         btrfs_warn(fs_info,
2501                                    "sysfs: failed to create fsid for sprout");
2502         }
2503
2504         ret = btrfs_commit_transaction(trans);
2505
2506         if (seeding_dev) {
2507                 mutex_unlock(&uuid_mutex);
2508                 up_write(&sb->s_umount);
2509                 unlocked = true;
2510
2511                 if (ret) /* transaction commit */
2512                         return ret;
2513
2514                 ret = btrfs_relocate_sys_chunks(fs_info);
2515                 if (ret < 0)
2516                         btrfs_handle_fs_error(fs_info, ret,
2517                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2518                 trans = btrfs_attach_transaction(root);
2519                 if (IS_ERR(trans)) {
2520                         if (PTR_ERR(trans) == -ENOENT)
2521                                 return 0;
2522                         ret = PTR_ERR(trans);
2523                         trans = NULL;
2524                         goto error_sysfs;
2525                 }
2526                 ret = btrfs_commit_transaction(trans);
2527         }
2528
2529         /* Update ctime/mtime for libblkid */
2530         update_dev_time(device_path);
2531         return ret;
2532
2533 error_sysfs:
2534         btrfs_sysfs_rm_device_link(fs_devices, device);
2535         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2536         mutex_lock(&fs_info->chunk_mutex);
2537         list_del_rcu(&device->dev_list);
2538         list_del(&device->dev_alloc_list);
2539         fs_info->fs_devices->num_devices--;
2540         fs_info->fs_devices->open_devices--;
2541         fs_info->fs_devices->rw_devices--;
2542         fs_info->fs_devices->total_devices--;
2543         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2544         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2545         btrfs_set_super_total_bytes(fs_info->super_copy,
2546                                     orig_super_total_bytes);
2547         btrfs_set_super_num_devices(fs_info->super_copy,
2548                                     orig_super_num_devices);
2549         mutex_unlock(&fs_info->chunk_mutex);
2550         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2551 error_trans:
2552         if (seeding_dev)
2553                 sb->s_flags |= SB_RDONLY;
2554         if (trans)
2555                 btrfs_end_transaction(trans);
2556 error_free_device:
2557         btrfs_free_device(device);
2558 error:
2559         blkdev_put(bdev, FMODE_EXCL);
2560         if (seeding_dev && !unlocked) {
2561                 mutex_unlock(&uuid_mutex);
2562                 up_write(&sb->s_umount);
2563         }
2564         return ret;
2565 }
2566
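/*
 * Write the in-memory state of @device (sizes, alignment, type) back to its
 * dev item in the chunk tree.
 */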
2567 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2568                                         struct btrfs_device *device)
2569 {
2570         int ret;
2571         struct btrfs_path *path;
2572         struct btrfs_root *root = device->fs_info->chunk_root;
2573         struct btrfs_dev_item *dev_item;
2574         struct extent_buffer *leaf;
2575         struct btrfs_key key;
2576
2577         path = btrfs_alloc_path();
2578         if (!path)
2579                 return -ENOMEM;
2580
2581         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2582         key.type = BTRFS_DEV_ITEM_KEY;
2583         key.offset = device->devid;
2584
2585         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2586         if (ret < 0)
2587                 goto out;
2588
2589         if (ret > 0) {
2590                 ret = -ENOENT;
2591                 goto out;
2592         }
2593
2594         leaf = path->nodes[0];
2595         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2596
2597         btrfs_set_device_id(leaf, dev_item, device->devid);
2598         btrfs_set_device_type(leaf, dev_item, device->type);
2599         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2600         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2601         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2602         btrfs_set_device_total_bytes(leaf, dev_item,
2603                                      btrfs_device_get_disk_total_bytes(device));
2604         btrfs_set_device_bytes_used(leaf, dev_item,
2605                                     btrfs_device_get_bytes_used(device));
2606         btrfs_mark_buffer_dirty(leaf);
2607
2608 out:
2609         btrfs_free_path(path);
2610         return ret;
2611 }
2612
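/*
 * Grow @device to @new_size, rounded down to the sector size. Updates the
 * superblock total and total_rw_bytes, and queues the device on the resized
 * list so that the new size is committed with the transaction.
 */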
2613 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2614                       struct btrfs_device *device, u64 new_size)
2615 {
2616         struct btrfs_fs_info *fs_info = device->fs_info;
2617         struct btrfs_super_block *super_copy = fs_info->super_copy;
2618         struct btrfs_fs_devices *fs_devices;
2619         u64 old_total;
2620         u64 diff;
2621
2622         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2623                 return -EACCES;
2624
2625         new_size = round_down(new_size, fs_info->sectorsize);
2626
2627         mutex_lock(&fs_info->chunk_mutex);
2628         old_total = btrfs_super_total_bytes(super_copy);
2629         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2630
2631         if (new_size <= device->total_bytes ||
2632             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2633                 mutex_unlock(&fs_info->chunk_mutex);
2634                 return -EINVAL;
2635         }
2636
2637         fs_devices = fs_info->fs_devices;
2638
2639         btrfs_set_super_total_bytes(super_copy,
2640                         round_down(old_total + diff, fs_info->sectorsize));
2641         device->fs_devices->total_rw_bytes += diff;
2642
2643         btrfs_device_set_total_bytes(device, new_size);
2644         btrfs_device_set_disk_total_bytes(device, new_size);
2645         btrfs_clear_space_info_full(device->fs_info);
2646         if (list_empty(&device->resized_list))
2647                 list_add_tail(&device->resized_list,
2648                               &fs_devices->resized_devices);
2649         mutex_unlock(&fs_info->chunk_mutex);
2650
2651         return btrfs_update_device(trans, device);
2652 }
2653
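/*
 * Delete the chunk item at @chunk_offset from the chunk tree.
 */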
2654 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2655 {
2656         struct btrfs_fs_info *fs_info = trans->fs_info;
2657         struct btrfs_root *root = fs_info->chunk_root;
2658         int ret;
2659         struct btrfs_path *path;
2660         struct btrfs_key key;
2661
2662         path = btrfs_alloc_path();
2663         if (!path)
2664                 return -ENOMEM;
2665
2666         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2667         key.offset = chunk_offset;
2668         key.type = BTRFS_CHUNK_ITEM_KEY;
2669
2670         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2671         if (ret < 0)
2672                 goto out;
2673         else if (ret > 0) { /* Logic error or corruption */
2674                 btrfs_handle_fs_error(fs_info, -ENOENT,
2675                                       "Failed lookup while freeing chunk.");
2676                 ret = -ENOENT;
2677                 goto out;
2678         }
2679
2680         ret = btrfs_del_item(trans, root, path);
2681         if (ret < 0)
2682                 btrfs_handle_fs_error(fs_info, ret,
2683                                       "Failed to delete chunk item.");
2684 out:
2685         btrfs_free_path(path);
2686         return ret;
2687 }
2688
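/*
 * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array by
 * walking the packed (disk key, chunk item) pairs and moving the tail of the
 * array over the matching entry.
 */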
2689 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2690 {
2691         struct btrfs_super_block *super_copy = fs_info->super_copy;
2692         struct btrfs_disk_key *disk_key;
2693         struct btrfs_chunk *chunk;
2694         u8 *ptr;
2695         int ret = 0;
2696         u32 num_stripes;
2697         u32 array_size;
2698         u32 len = 0;
2699         u32 cur;
2700         struct btrfs_key key;
2701
2702         mutex_lock(&fs_info->chunk_mutex);
2703         array_size = btrfs_super_sys_array_size(super_copy);
2704
2705         ptr = super_copy->sys_chunk_array;
2706         cur = 0;
2707
2708         while (cur < array_size) {
2709                 disk_key = (struct btrfs_disk_key *)ptr;
2710                 btrfs_disk_key_to_cpu(&key, disk_key);
2711
2712                 len = sizeof(*disk_key);
2713
2714                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2715                         chunk = (struct btrfs_chunk *)(ptr + len);
2716                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2717                         len += btrfs_chunk_item_size(num_stripes);
2718                 } else {
2719                         ret = -EIO;
2720                         break;
2721                 }
2722                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2723                     key.offset == chunk_offset) {
2724                         memmove(ptr, ptr + len, array_size - (cur + len));
2725                         array_size -= len;
2726                         btrfs_set_super_sys_array_size(super_copy, array_size);
2727                 } else {
2728                         ptr += len;
2729                         cur += len;
2730                 }
2731         }
2732         mutex_unlock(&fs_info->chunk_mutex);
2733         return ret;
2734 }
2735
2736 /*
2737  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2738  * @fs_info: Filesystem info.
2739  * @logical: Logical block offset in bytes.
2740  * @length: Length of extent in bytes.
2741  * Return: Chunk mapping or ERR_PTR.
2742  */
2743 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2744                                        u64 logical, u64 length)
2745 {
2746         struct extent_map_tree *em_tree;
2747         struct extent_map *em;
2748
2749         em_tree = &fs_info->mapping_tree.map_tree;
2750         read_lock(&em_tree->lock);
2751         em = lookup_extent_mapping(em_tree, logical, length);
2752         read_unlock(&em_tree->lock);
2753
2754         if (!em) {
2755                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2756                            logical, length);
2757                 return ERR_PTR(-EINVAL);
2758         }
2759
2760         if (em->start > logical || em->start + em->len < logical) {
2761                 btrfs_crit(fs_info,
2762                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2763                            logical, length, em->start, em->start + em->len);
2764                 free_extent_map(em);
2765                 return ERR_PTR(-EINVAL);
2766         }
2767
2768         /* callers are responsible for dropping em's ref. */
2769         return em;
2770 }
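/*
 * A minimal caller sketch, mirroring how btrfs_remove_chunk() below uses the
 * lookup (the "... use map ..." step stands in for caller-specific work):
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	... use map ...
 *	free_extent_map(em);
 */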
2771
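/*
 * Remove the chunk at @chunk_offset: free the dev extents of all its stripes,
 * delete the chunk item (and its sys_chunk_array copy for system chunks) and
 * remove the now unused block group.
 */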
2772 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2773 {
2774         struct btrfs_fs_info *fs_info = trans->fs_info;
2775         struct extent_map *em;
2776         struct map_lookup *map;
2777         u64 dev_extent_len = 0;
2778         int i, ret = 0;
2779         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2780
2781         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2782         if (IS_ERR(em)) {
2783                 /*
2784                  * This is a logic error, but we don't want to just rely on the
2785                  * user having built with ASSERT enabled, so if ASSERT doesn't
2786                  * do anything we still error out.
2787                  */
2788                 ASSERT(0);
2789                 return PTR_ERR(em);
2790         }
2791         map = em->map_lookup;
2792         mutex_lock(&fs_info->chunk_mutex);
2793         check_system_chunk(trans, map->type);
2794         mutex_unlock(&fs_info->chunk_mutex);
2795
2796         /*
2797          * Take the device list mutex to prevent races with the final phase of
2798          * a device replace operation that replaces the device object associated
2799          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2800          */
2801         mutex_lock(&fs_devices->device_list_mutex);
2802         for (i = 0; i < map->num_stripes; i++) {
2803                 struct btrfs_device *device = map->stripes[i].dev;
2804                 ret = btrfs_free_dev_extent(trans, device,
2805                                             map->stripes[i].physical,
2806                                             &dev_extent_len);
2807                 if (ret) {
2808                         mutex_unlock(&fs_devices->device_list_mutex);
2809                         btrfs_abort_transaction(trans, ret);
2810                         goto out;
2811                 }
2812
2813                 if (device->bytes_used > 0) {
2814                         mutex_lock(&fs_info->chunk_mutex);
2815                         btrfs_device_set_bytes_used(device,
2816                                         device->bytes_used - dev_extent_len);
2817                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2818                         btrfs_clear_space_info_full(fs_info);
2819                         mutex_unlock(&fs_info->chunk_mutex);
2820                 }
2821
2822                 if (map->stripes[i].dev) {
2823                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2824                         if (ret) {
2825                                 mutex_unlock(&fs_devices->device_list_mutex);
2826                                 btrfs_abort_transaction(trans, ret);
2827                                 goto out;
2828                         }
2829                 }
2830         }
2831         mutex_unlock(&fs_devices->device_list_mutex);
2832
2833         ret = btrfs_free_chunk(trans, chunk_offset);
2834         if (ret) {
2835                 btrfs_abort_transaction(trans, ret);
2836                 goto out;
2837         }
2838
2839         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2840
2841         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2842                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2843                 if (ret) {
2844                         btrfs_abort_transaction(trans, ret);
2845                         goto out;
2846                 }
2847         }
2848
2849         ret = btrfs_remove_block_group(trans, chunk_offset, em);
2850         if (ret) {
2851                 btrfs_abort_transaction(trans, ret);
2852                 goto out;
2853         }
2854
2855 out:
2856         /* Once for us (drop the ref taken by btrfs_get_chunk_map()). */
2857         free_extent_map(em);
2858         return ret;
2859 }
2860
2861 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2862 {
2863         struct btrfs_root *root = fs_info->chunk_root;
2864         struct btrfs_trans_handle *trans;
2865         int ret;
2866
2867         /*
2868          * Prevent races with automatic removal of unused block groups.
2869          * After we relocate and before we remove the chunk with offset
2870          * chunk_offset, automatic removal of the block group can kick in,
2871          * resulting in a failure when calling btrfs_remove_chunk() below.
2872          *
2873          * Make sure to acquire this mutex before doing a tree search (dev
2874          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2875          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2876          * we release the path used to search the chunk/dev tree and before
2877          * the current task acquires this mutex and calls us.
2878          */
2879         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2880
2881         ret = btrfs_can_relocate(fs_info, chunk_offset);
2882         if (ret)
2883                 return -ENOSPC;
2884
2885         /* step one, relocate all the extents inside this chunk */
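             /* Scrub is paused while the block group is relocated and resumed afterwards. */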
2886         btrfs_scrub_pause(fs_info);
2887         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2888         btrfs_scrub_continue(fs_info);
2889         if (ret)
2890                 return ret;
2891
2892         /*
2893          * We add the kobjects here (and after forcing data chunk creation)
2894          * since relocation is the only place we'll create chunks of a new
2895          * type at runtime.  The only place where we'll remove the last
2896          * chunk of a type is the call immediately below this one.  Even
2897          * so, we're protected against races with the cleaner thread since
2898          * we're covered by the delete_unused_bgs_mutex.
2899          */
2900         btrfs_add_raid_kobjects(fs_info);
2901
2902         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2903                                                      chunk_offset);
2904         if (IS_ERR(trans)) {
2905                 ret = PTR_ERR(trans);
2906                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2907                 return ret;
2908         }
2909
2910         /*
2911          * step two, delete the device extents and the
2912          * chunk tree entries
2913          */
2914         ret = btrfs_remove_chunk(trans, chunk_offset);
2915         btrfs_end_transaction(trans);
2916         return ret;
2917 }
2918
2919 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2920 {
2921         struct btrfs_root *chunk_root = fs_info->chunk_root;
2922         struct btrfs_path *path;
2923         struct extent_buffer *leaf;
2924         struct btrfs_chunk *chunk;
2925         struct btrfs_key key;
2926         struct btrfs_key found_key;
2927         u64 chunk_type;
2928         bool retried = false;
2929         int failed = 0;
2930         int ret;
2931
2932         path = btrfs_alloc_path();
2933         if (!path)
2934                 return -ENOMEM;
2935
2936 again:
2937         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2938         key.offset = (u64)-1;
2939         key.type = BTRFS_CHUNK_ITEM_KEY;
2940
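             /*
              * Walk the chunk tree backwards from the highest chunk offset,
              * relocating every SYSTEM chunk found along the way.
              */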
2941         while (1) {
2942                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2943                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2944                 if (ret < 0) {
2945                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2946                         goto error;
2947                 }
2948                 BUG_ON(ret == 0); /* Corruption */
2949
2950                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2951                                           key.type);
2952                 if (ret)
2953                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2954                 if (ret < 0)
2955                         goto error;
2956                 if (ret > 0)
2957                         break;
2958
2959                 leaf = path->nodes[0];
2960                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2961
2962                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2963                                        struct btrfs_chunk);
2964                 chunk_type = btrfs_chunk_type(leaf, chunk);
2965                 btrfs_release_path(path);
2966
2967                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2968                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
2969                         if (ret == -ENOSPC)
2970                                 failed++;
2971                         else
2972                                 BUG_ON(ret);
2973                 }
2974                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2975
2976                 if (found_key.offset == 0)
2977                         break;
2978                 key.offset = found_key.offset - 1;
2979         }
2980         ret = 0;
2981         if (failed && !retried) {
2982                 failed = 0;
2983                 retried = true;
2984                 goto again;
2985         } else if (WARN_ON(failed && retried)) {
2986                 ret = -ENOSPC;
2987         }
2988 error:
2989         btrfs_free_path(path);
2990         return ret;
2991 }
2992
2993 /*
2994  * Return 1 : a data chunk was allocated successfully,
2995  * return <0: an error occurred while allocating a data chunk,
2996  * return 0 : no need to allocate a data chunk.
2997  */
2998 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
2999                                       u64 chunk_offset)
3000 {
3001         struct btrfs_block_group_cache *cache;
3002         u64 bytes_used;
3003         u64 chunk_type;
3004
3005         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3006         ASSERT(cache);
3007         chunk_type = cache->flags;
3008         btrfs_put_block_group(cache);
3009
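             /*
              * Only when the chunk being relocated is a data chunk and no
              * data bytes are in use do we pre-allocate a replacement, so the
              * data raid profile is not lost with the last data chunk.
              */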
3010         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3011                 spin_lock(&fs_info->data_sinfo->lock);
3012                 bytes_used = fs_info->data_sinfo->bytes_used;
3013                 spin_unlock(&fs_info->data_sinfo->lock);
3014
3015                 if (!bytes_used) {
3016                         struct btrfs_trans_handle *trans;
3017                         int ret;
3018
3019                         trans = btrfs_join_transaction(fs_info->tree_root);
3020                         if (IS_ERR(trans))
3021                                 return PTR_ERR(trans);
3022
3023                         ret = btrfs_force_chunk_alloc(trans,
3024                                                       BTRFS_BLOCK_GROUP_DATA);
3025                         btrfs_end_transaction(trans);
3026                         if (ret < 0)
3027                                 return ret;
3028
3029                         btrfs_add_raid_kobjects(fs_info);
3030
3031                         return 1;
3032                 }
3033         }
3034         return 0;
3035 }
3036
3037 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3038                                struct btrfs_balance_control *bctl)
3039 {
3040         struct btrfs_root *root = fs_info->tree_root;
3041         struct btrfs_trans_handle *trans;
3042         struct btrfs_balance_item *item;
3043         struct btrfs_disk_balance_args disk_bargs;
3044         struct btrfs_path *path;
3045         struct extent_buffer *leaf;
3046         struct btrfs_key key;
3047         int ret, err;
3048
3049         path = btrfs_alloc_path();
3050         if (!path)
3051                 return -ENOMEM;
3052
3053         trans = btrfs_start_transaction(root, 0);
3054         if (IS_ERR(trans)) {
3055                 btrfs_free_path(path);
3056                 return PTR_ERR(trans);
3057         }
3058
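             /*
              * The balance item lives in the tree root at
              * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0), so an
              * interrupted balance can be resumed later.
              */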
3059         key.objectid = BTRFS_BALANCE_OBJECTID;
3060         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3061         key.offset = 0;
3062
3063         ret = btrfs_insert_empty_item(trans, root, path, &key,
3064                                       sizeof(*item));
3065         if (ret)
3066                 goto out;
3067
3068         leaf = path->nodes[0];
3069         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3070
3071         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3072
3073         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3074         btrfs_set_balance_data(leaf, item, &disk_bargs);
3075         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3076         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3077         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3078         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3079
3080         btrfs_set_balance_flags(leaf, item, bctl->flags);
3081
3082         btrfs_mark_buffer_dirty(leaf);
3083 out:
3084         btrfs_free_path(path);
3085         err = btrfs_commit_transaction(trans);
3086         if (err && !ret)
3087                 ret = err;
3088         return ret;
3089 }
3090
3091 static int del_balance_item(struct btrfs_fs_info *fs_info)
3092 {
3093         struct btrfs_root *root = fs_info->tree_root;
3094         struct btrfs_trans_handle *trans;
3095         struct btrfs_path *path;
3096         struct btrfs_key key;
3097         int ret, err;
3098
3099         path = btrfs_alloc_path();
3100         if (!path)
3101                 return -ENOMEM;
3102
3103         trans = btrfs_start_transaction(root, 0);
3104         if (IS_ERR(trans)) {
3105                 btrfs_free_path(path);
3106                 return PTR_ERR(trans);
3107         }
3108
3109         key.objectid = BTRFS_BALANCE_OBJECTID;
3110         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3111         key.offset = 0;
3112
3113         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3114         if (ret < 0)
3115                 goto out;
3116         if (ret > 0) {
3117                 ret = -ENOENT;
3118                 goto out;
3119         }
3120
3121         ret = btrfs_del_item(trans, root, path);
3122 out:
3123         btrfs_free_path(path);
3124         err = btrfs_commit_transaction(trans);
3125         if (err && !ret)
3126                 ret = err;
3127         return ret;
3128 }
3129
3130 /*
3131  * This is a heuristic used to reduce the number of chunks balanced on
3132  * resume after balance was interrupted.
3133  */
3134 static void update_balance_args(struct btrfs_balance_control *bctl)
3135 {
3136         /*
3137          * Turn on soft mode for chunk types that were being converted.
3138          */
3139         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3140                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3141         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3142                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3143         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3144                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3145
3146         /*
3147          * Turn on the usage filter if it is not already in use.  The idea is
3148          * that chunks that we have already balanced should be
3149          * reasonably full.  Don't do it for chunks that are being
3150          * converted - that will keep us from relocating unconverted
3151          * (albeit full) chunks.
3152          */
3153         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3154             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3155             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3156                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3157                 bctl->data.usage = 90;
3158         }
3159         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3160             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3161             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3162                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3163                 bctl->sys.usage = 90;
3164         }
3165         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3166             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3167             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3168                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3169                 bctl->meta.usage = 90;
3170         }
3171 }
3172
3173 /*
3174  * Clear the balance status in fs_info and delete the balance item from disk.
3175  */
3176 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3177 {
3178         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3179         int ret;
3180
3181         BUG_ON(!fs_info->balance_ctl);
3182
3183         spin_lock(&fs_info->balance_lock);
3184         fs_info->balance_ctl = NULL;
3185         spin_unlock(&fs_info->balance_lock);
3186
3187         kfree(bctl);
3188         ret = del_balance_item(fs_info);
3189         if (ret)
3190                 btrfs_handle_fs_error(fs_info, ret, NULL);
3191 }
3192
3193 /*
3194  * Balance filters.  Return 1 if chunk should be filtered out
3195  * (should not be balanced).
3196  */
3197 static int chunk_profiles_filter(u64 chunk_type,
3198                                  struct btrfs_balance_args *bargs)
3199 {
3200         chunk_type = chunk_to_extended(chunk_type) &
3201                                 BTRFS_EXTENDED_PROFILE_MASK;
3202
3203         if (bargs->profiles & chunk_type)
3204                 return 0;
3205
3206         return 1;
3207 }
3208
3209 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3210                               struct btrfs_balance_args *bargs)
3211 {
3212         struct btrfs_block_group_cache *cache;
3213         u64 chunk_used;
3214         u64 user_thresh_min;
3215         u64 user_thresh_max;
3216         int ret = 1;
3217
3218         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3219         chunk_used = btrfs_block_group_used(&cache->item);
3220
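             /* The thresholds are percentages of the block group's length (key.offset). */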
3221         if (bargs->usage_min == 0)
3222                 user_thresh_min = 0;
3223         else
3224                 user_thresh_min = div_factor_fine(cache->key.offset,
3225                                         bargs->usage_min);
3226
3227         if (bargs->usage_max == 0)
3228                 user_thresh_max = 1;
3229         else if (bargs->usage_max > 100)
3230                 user_thresh_max = cache->key.offset;
3231         else
3232                 user_thresh_max = div_factor_fine(cache->key.offset,
3233                                         bargs->usage_max);
3234
3235         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3236                 ret = 0;
3237
3238         btrfs_put_block_group(cache);
3239         return ret;
3240 }
3241
3242 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3243                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3244 {
3245         struct btrfs_block_group_cache *cache;
3246         u64 chunk_used, user_thresh;
3247         int ret = 1;
3248
3249         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3250         chunk_used = btrfs_block_group_used(&cache->item);
3251
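             /*
              * usage and usage_min/usage_max share the same bytes in the
              * balance args union, which is why both field names appear below.
              */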
3252         if (bargs->usage_min == 0)
3253                 user_thresh = 1;
3254         else if (bargs->usage > 100)
3255                 user_thresh = cache->key.offset;
3256         else
3257                 user_thresh = div_factor_fine(cache->key.offset,
3258                                               bargs->usage);
3259
3260         if (chunk_used < user_thresh)
3261                 ret = 0;
3262
3263         btrfs_put_block_group(cache);
3264         return ret;
3265 }
3266
3267 static int chunk_devid_filter(struct extent_buffer *leaf,
3268                               struct btrfs_chunk *chunk,
3269                               struct btrfs_balance_args *bargs)
3270 {
3271         struct btrfs_stripe *stripe;
3272         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3273         int i;
3274
3275         for (i = 0; i < num_stripes; i++) {
3276                 stripe = btrfs_stripe_nr(chunk, i);
3277                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3278                         return 0;
3279         }
3280
3281         return 1;
3282 }
3283
3284 /* [pstart, pend) */
3285 static int chunk_drange_filter(struct extent_buffer *leaf,
3286                                struct btrfs_chunk *chunk,
3287                                struct btrfs_balance_args *bargs)
3288 {
3289         struct btrfs_stripe *stripe;
3290         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3291         u64 stripe_offset;
3292         u64 stripe_length;
3293         int factor;
3294         int i;
3295
3296         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3297                 return 0;
3298
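             /*
              * Determine how many data stripes the chunk spreads across, so
              * the per-device stripe length can be computed below.
              */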
3299         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3300              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3301                 factor = num_stripes / 2;
3302         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3303                 factor = num_stripes - 1;
3304         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3305                 factor = num_stripes - 2;
3306         } else {
3307                 factor = num_stripes;
3308         }
3309
3310         for (i = 0; i < num_stripes; i++) {
3311                 stripe = btrfs_stripe_nr(chunk, i);
3312                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3313                         continue;
3314
3315                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3316                 stripe_length = btrfs_chunk_length(leaf, chunk);
3317                 stripe_length = div_u64(stripe_length, factor);
3318
3319                 if (stripe_offset < bargs->pend &&
3320                     stripe_offset + stripe_length > bargs->pstart)
3321                         return 0;
3322         }
3323
3324         return 1;
3325 }
3326
3327 /* [vstart, vend) */
3328 static int chunk_vrange_filter(struct extent_buffer *leaf,
3329                                struct btrfs_chunk *chunk,
3330                                u64 chunk_offset,
3331                                struct btrfs_balance_args *bargs)
3332 {
3333         if (chunk_offset < bargs->vend &&
3334             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3335                 /* at least part of the chunk is inside this vrange */
3336                 return 0;
3337
3338         return 1;
3339 }
3340
3341 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3342                                struct btrfs_chunk *chunk,
3343                                struct btrfs_balance_args *bargs)
3344 {
3345         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3346
3347         if (bargs->stripes_min <= num_stripes
3348                         && num_stripes <= bargs->stripes_max)
3349                 return 0;
3350
3351         return 1;
3352 }
3353
3354 static int chunk_soft_convert_filter(u64 chunk_type,
3355                                      struct btrfs_balance_args *bargs)
3356 {
3357         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3358                 return 0;
3359
3360         chunk_type = chunk_to_extended(chunk_type) &
3361                                 BTRFS_EXTENDED_PROFILE_MASK;
3362
3363         if (bargs->target == chunk_type)
3364                 return 1;
3365
3366         return 0;
3367 }
3368
3369 static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3370                                 struct extent_buffer *leaf,
3371                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3372 {
3373         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3374         struct btrfs_balance_args *bargs = NULL;
3375         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3376
3377         /* type filter */
3378         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3379               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3380                 return 0;
3381         }
3382
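             /* Pick the balance args that match this chunk's type. */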
3383         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3384                 bargs = &bctl->data;
3385         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3386                 bargs = &bctl->sys;
3387         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3388                 bargs = &bctl->meta;
3389
3390         /* profiles filter */
3391         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3392             chunk_profiles_filter(chunk_type, bargs)) {
3393                 return 0;
3394         }
3395
3396         /* usage filter */
3397         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3398             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3399                 return 0;
3400         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3401             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3402                 return 0;
3403         }
3404
3405         /* devid filter */
3406         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3407             chunk_devid_filter(leaf, chunk, bargs)) {
3408                 return 0;
3409         }
3410
3411         /* drange filter, makes sense only with devid filter */
3412         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3413             chunk_drange_filter(leaf, chunk, bargs)) {
3414                 return 0;
3415         }
3416
3417         /* vrange filter */
3418         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3419             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3420                 return 0;
3421         }
3422
3423         /* stripes filter */
3424         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3425             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3426                 return 0;
3427         }
3428
3429         /* soft profile changing mode */
3430         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3431             chunk_soft_convert_filter(chunk_type, bargs)) {
3432                 return 0;
3433         }
3434
3435         /*
3436          * Limited by chunk count; must be the last filter since it
3437          * decrements the remaining limit.
3437          */
3438         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3439                 if (bargs->limit == 0)
3440                         return 0;
3441                 else
3442                         bargs->limit--;
3443         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3444                 /*
3445                  * Same logic as the 'limit' filter; the minimum cannot be
3446                  * determined here because we do not have the global information
3447                  * about the count of all chunks that satisfy the filters.
3448                  */
3449                 if (bargs->limit_max == 0)
3450                         return 0;
3451                 else
3452                         bargs->limit_max--;
3453         }
3454
3455         return 1;
3456 }
3457
3458 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3459 {
3460         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3461         struct btrfs_root *chunk_root = fs_info->chunk_root;
3462         struct btrfs_root *dev_root = fs_info->dev_root;
3463         struct list_head *devices;
3464         struct btrfs_device *device;
3465         u64 old_size;
3466         u64 size_to_free;
3467         u64 chunk_type;
3468         struct btrfs_chunk *chunk;
3469         struct btrfs_path *path = NULL;
3470         struct btrfs_key key;
3471         struct btrfs_key found_key;
3472         struct btrfs_trans_handle *trans;
3473         struct extent_buffer *leaf;
3474         int slot;
3475         int ret;
3476         int enospc_errors = 0;
3477         bool counting = true;
3478         /* The single value limit and min/max limits use the same bytes in the args union. */
3479         u64 limit_data = bctl->data.limit;
3480         u64 limit_meta = bctl->meta.limit;
3481         u64 limit_sys = bctl->sys.limit;
3482         u32 count_data = 0;
3483         u32 count_meta = 0;
3484         u32 count_sys = 0;
3485         int chunk_reserved = 0;
3486
3487         /* step one, make some room on all the devices */
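             /*
              * Shrinking each device slightly and growing it back relocates
              * the chunks at the tail of the device, leaving free room there.
              */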
3488         devices = &fs_info->fs_devices->devices;
3489         list_for_each_entry(device, devices, dev_list) {
3490                 old_size = btrfs_device_get_total_bytes(device);
3491                 size_to_free = div_factor(old_size, 1);
3492                 size_to_free = min_t(u64, size_to_free, SZ_1M);
3493                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3494                     btrfs_device_get_total_bytes(device) -
3495                     btrfs_device_get_bytes_used(device) > size_to_free ||
3496                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3497                         continue;
3498
3499                 ret = btrfs_shrink_device(device, old_size - size_to_free);
3500                 if (ret == -ENOSPC)
3501                         break;
3502                 if (ret) {
3503                         /* btrfs_shrink_device never returns ret > 0 */
3504                         WARN_ON(ret > 0);
3505                         goto error;
3506                 }
3507
3508                 trans = btrfs_start_transaction(dev_root, 0);
3509                 if (IS_ERR(trans)) {
3510                         ret = PTR_ERR(trans);
3511                         btrfs_info_in_rcu(fs_info,
3512                  "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3513                                           rcu_str_deref(device->name), ret,
3514                                           old_size, old_size - size_to_free);
3515                         goto error;
3516                 }
3517
3518                 ret = btrfs_grow_device(trans, device, old_size);
3519                 if (ret) {
3520                         btrfs_end_transaction(trans);
3521                         /* btrfs_grow_device never returns ret > 0 */
3522                         WARN_ON(ret > 0);
3523                         btrfs_info_in_rcu(fs_info,
3524                  "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3525                                           rcu_str_deref(device->name), ret,
3526                                           old_size, old_size - size_to_free);
3527                         goto error;
3528                 }
3529
3530                 btrfs_end_transaction(trans);
3531         }
3532
3533         /* step two, relocate all the chunks */
3534         path = btrfs_alloc_path();
3535         if (!path) {
3536                 ret = -ENOMEM;
3537                 goto error;
3538         }
3539
3540         /* zero out stat counters */
3541         spin_lock(&fs_info->balance_lock);
3542         memset(&bctl->stat, 0, sizeof(bctl->stat));
3543         spin_unlock(&fs_info->balance_lock);
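             /*
              * Two passes: the first (counting == true) only counts the
              * chunks that pass the filters; the second does the relocation.
              */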
3544 again:
3545         if (!counting) {
3546                 /*
3547                  * The single value limit and min/max limits use the same bytes in
3548                  * the args union; restore the values saved before counting.
3549                  */
3550                 bctl->data.limit = limit_data;
3551                 bctl->meta.limit = limit_meta;
3552                 bctl->sys.limit = limit_sys;
3553         }
3554         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3555         key.offset = (u64)-1;
3556         key.type = BTRFS_CHUNK_ITEM_KEY;
3557
3558         while (1) {
3559                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3560                     atomic_read(&fs_info->balance_cancel_req)) {
3561                         ret = -ECANCELED;
3562                         goto error;
3563                 }
3564
3565                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3566                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3567                 if (ret < 0) {
3568                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3569                         goto error;
3570                 }
3571
3572                 /*
3573                  * This shouldn't happen; it means the last relocation
3574                  * failed.
3575                  */
3576                 if (ret == 0)
3577                         BUG(); /* FIXME break ? */
3578
3579                 ret = btrfs_previous_item(chunk_root, path, 0,
3580                                           BTRFS_CHUNK_ITEM_KEY);
3581                 if (ret) {
3582                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3583                         ret = 0;
3584                         break;
3585                 }
3586
3587                 leaf = path->nodes[0];
3588                 slot = path->slots[0];
3589                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3590
3591                 if (found_key.objectid != key.objectid) {
3592                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3593                         break;
3594                 }
3595
3596                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3597                 chunk_type = btrfs_chunk_type(leaf, chunk);
3598
3599                 if (!counting) {
3600                         spin_lock(&fs_info->balance_lock);
3601                         bctl->stat.considered++;
3602                         spin_unlock(&fs_info->balance_lock);
3603                 }
3604
3605                 ret = should_balance_chunk(fs_info, leaf, chunk,
3606                                            found_key.offset);
3607
3608                 btrfs_release_path(path);
3609                 if (!ret) {
3610                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3611                         goto loop;
3612                 }
3613
3614                 if (counting) {
3615                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3616                         spin_lock(&fs_info->balance_lock);
3617                         bctl->stat.expected++;
3618                         spin_unlock(&fs_info->balance_lock);
3619
3620                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3621                                 count_data++;
3622                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3623                                 count_sys++;
3624                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3625                                 count_meta++;
3626
3627                         goto loop;
3628                 }
3629
3630                 /*
3631                  * Apply limit_min filter, no need to check if the LIMITS
3632                  * filter is used, limit_min is 0 by default
3633                  */
3634                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3635                                         count_data < bctl->data.limit_min)
3636                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3637                                         count_meta < bctl->meta.limit_min)
3638                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3639                                         count_sys < bctl->sys.limit_min)) {
3640                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3641                         goto loop;
3642                 }
3643
3644                 if (!chunk_reserved) {
3645                         /*
3646                          * We may be relocating the only data chunk we have,
3647                          * which could mean losing the data raid profile,
3648                          * so let's allocate an empty data chunk in
3649                          * advance.
3650                          */
3651                         ret = btrfs_may_alloc_data_chunk(fs_info,
3652                                                          found_key.offset);
3653                         if (ret < 0) {
3654                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3655                                 goto error;
3656                         } else if (ret == 1) {
3657                                 chunk_reserved = 1;
3658                         }
3659                 }
3660
3661                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3662                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3663                 if (ret == -ENOSPC) {
3664                         enospc_errors++;
3665                 } else if (ret == -ETXTBSY) {
3666                         btrfs_info(fs_info,
3667            "skipping relocation of block group %llu due to active swapfile",
3668                                    found_key.offset);
3669                         ret = 0;
3670                 } else if (ret) {
3671                         goto error;
3672                 } else {
3673                         spin_lock(&fs_info->balance_lock);
3674                         bctl->stat.completed++;
3675                         spin_unlock(&fs_info->balance_lock);
3676                 }
3677 loop:
3678                 if (found_key.offset == 0)
3679                         break;
3680                 key.offset = found_key.offset - 1;
3681         }
3682
3683         if (counting) {
3684                 btrfs_release_path(path);
3685                 counting = false;
3686                 goto again;
3687         }
3688 error:
3689         btrfs_free_path(path);
3690         if (enospc_errors) {
3691                 btrfs_info(fs_info, "%d enospc errors during balance",
3692                            enospc_errors);
3693                 if (!ret)
3694                         ret = -ENOSPC;
3695         }
3696
3697         return ret;
3698 }
3699
3700 /**
3701  * alloc_profile_is_valid - see if a given profile is valid and reduced
3702  * @flags: profile to validate
3703  * @extended: if true @flags is treated as an extended profile
3704  */
3705 static int alloc_profile_is_valid(u64 flags, int extended)
3706 {
3707         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3708                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3709
3710         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3711
3712         /* 1) check that all other bits are zeroed */
3713         if (flags & ~mask)
3714                 return 0;
3715
3716         /* 2) see if profile is reduced */
3717         if (flags == 0)
3718                 return !extended; /* "0" is valid for usual profiles */
3719
3720         /* true if exactly one bit set */
3721         return is_power_of_2(flags);
3722 }
3723
3724 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3725 {
3726         /* cancel requested || normal exit path */
3727         return atomic_read(&fs_info->balance_cancel_req) ||
3728                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3729                  atomic_read(&fs_info->balance_cancel_req) == 0);
3730 }
3731
3732 /* Non-zero return value signifies invalidity */
3733 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3734                 u64 allowed)
3735 {
3736         return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3737                 (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3738                  (bctl_arg->target & ~allowed)));
3739 }
3740
3741 /*
3742  * Should be called with the balance mutex held.
3743  */
3744 int btrfs_balance(struct btrfs_fs_info *fs_info,
3745                   struct btrfs_balance_control *bctl,
3746                   struct btrfs_ioctl_balance_args *bargs)