fc9a3d8f62384a46fbdbdc77d937af6ba10a3355
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/bio.h>
8 #include <linux/slab.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/ratelimit.h>
12 #include <linux/kthread.h>
13 #include <linux/raid/pq.h>
14 #include <linux/semaphore.h>
15 #include <linux/uuid.h>
16 #include <linux/list_sort.h>
17 #include "ctree.h"
18 #include "extent_map.h"
19 #include "disk-io.h"
20 #include "transaction.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "async-thread.h"
25 #include "check-integrity.h"
26 #include "rcu-string.h"
27 #include "math.h"
28 #include "dev-replace.h"
29 #include "sysfs.h"
30
/*
 * Per-profile attributes for every RAID type btrfs supports, indexed by
 * enum btrfs_raid_types.  Consulted by chunk allocation, balance and the
 * mount-time minimum-device checks (see mindev_error below).
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,	/* stripes that mirror each other */
		.dev_stripes	= 1,	/* stripes placed per device */
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,	/* devices that may be lost */
		.devs_increment	= 2,	/* device count granularity */
		.ncopies	= 2,	/* copies of each block */
		.nparity	= 0,	/* parity stripes (raid5/6 only) */
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,	/* two copies on the same device */
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,	/* single has no dedicated bg flag */
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
124
125 const char *get_raid_name(enum btrfs_raid_types type)
126 {
127         if (type >= BTRFS_NR_RAID_TYPES)
128                 return NULL;
129
130         return btrfs_raid_array[type].raid_name;
131 }
132
133 static int init_first_rw_device(struct btrfs_trans_handle *trans,
134                                 struct btrfs_fs_info *fs_info);
135 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
136 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
137 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
138 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
139 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
140                              enum btrfs_map_op op,
141                              u64 logical, u64 *length,
142                              struct btrfs_bio **bbio_ret,
143                              int mirror_num, int need_raid_map);
144
145 /*
146  * Device locking
147  * ==============
148  *
149  * There are several mutexes that protect manipulation of devices and low-level
150  * structures like chunks but not block groups, extents or files
151  *
152  * uuid_mutex (global lock)
153  * ------------------------
154  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
155  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
156  * device) or requested by the device= mount option
157  *
158  * the mutex can be very coarse and can cover long-running operations
159  *
160  * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
162  *
163  * global::fs_devs - add, remove, updates to the global list
164  *
165  * does not protect: manipulation of the fs_devices::devices list!
166  *
167  * btrfs_device::name - renames (write side), read is RCU
168  *
169  * fs_devices::device_list_mutex (per-fs, with RCU)
170  * ------------------------------------------------
171  * protects updates to fs_devices::devices, ie. adding and deleting
172  *
173  * simple list traversal with read-only actions can be done with RCU protection
174  *
175  * may be used to exclude some operations from running concurrently without any
176  * modifications to the list (see write_all_supers)
177  *
178  * balance_mutex
179  * -------------
180  * protects balance structures (status, state) and context accessed from
181  * several places (internally, ioctl)
182  *
183  * chunk_mutex
184  * -----------
185  * protects chunks, adding or removing during allocation, trim or when a new
186  * device is added/removed
187  *
188  * cleaner_mutex
189  * -------------
190  * a big lock that is held by the cleaner thread and prevents running subvolume
191  * cleaning together with relocation or delayed iputs
192  *
193  *
194  * Lock nesting
195  * ============
196  *
197  * uuid_mutex
198  *   volume_mutex
199  *     device_list_mutex
200  *       chunk_mutex
201  *     balance_mutex
202  *
203  *
204  * Exclusive operations, BTRFS_FS_EXCL_OP
205  * ======================================
206  *
207  * Maintains the exclusivity of the following operations that apply to the
208  * whole filesystem and cannot run in parallel.
209  *
210  * - Balance (*)
211  * - Device add
212  * - Device remove
213  * - Device replace (*)
214  * - Resize
215  *
216  * The device operations (as above) can be in one of the following states:
217  *
218  * - Running state
219  * - Paused state
220  * - Completed state
221  *
222  * Only device operations marked with (*) can go into the Paused state for the
223  * following reasons:
224  *
225  * - ioctl (only Balance can be Paused through ioctl)
226  * - filesystem remounted as read-only
227  * - filesystem unmounted and mounted as read-only
228  * - system power-cycle and filesystem mounted as read-only
229  * - filesystem or device errors leading to forced read-only
230  *
231  * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
232  * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
233  * A device operation in Paused or Running state can be canceled or resumed
234  * either by ioctl (Balance only) or when remounted as read-write.
235  * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
236  * completed.
237  */
238
/* Global lock, see the "Device locking" section above for what it covers */
DEFINE_MUTEX(uuid_mutex);
/* All known btrfs_fs_devices, linked via fs_devices::fs_list */
static LIST_HEAD(fs_uuids);
/* Expose the head of the global fs_devices list to other btrfs files */
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}
245
246 /*
247  * alloc_fs_devices - allocate struct btrfs_fs_devices
248  * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
249  *
250  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
251  * The returned struct is not linked onto any lists and can be destroyed with
252  * kfree() right away.
253  */
254 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
255 {
256         struct btrfs_fs_devices *fs_devs;
257
258         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
259         if (!fs_devs)
260                 return ERR_PTR(-ENOMEM);
261
262         mutex_init(&fs_devs->device_list_mutex);
263
264         INIT_LIST_HEAD(&fs_devs->devices);
265         INIT_LIST_HEAD(&fs_devs->resized_devices);
266         INIT_LIST_HEAD(&fs_devs->alloc_list);
267         INIT_LIST_HEAD(&fs_devs->fs_list);
268         if (fsid)
269                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
270
271         return fs_devs;
272 }
273
/*
 * Free a btrfs_device together with the resources preallocated for it: the
 * RCU-protected name string and the flush bio.  The device is expected to
 * already be off any lists (callers do list_del() first).
 */
void btrfs_free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}
280
281 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
282 {
283         struct btrfs_device *device;
284         WARN_ON(fs_devices->opened);
285         while (!list_empty(&fs_devices->devices)) {
286                 device = list_entry(fs_devices->devices.next,
287                                     struct btrfs_device, dev_list);
288                 list_del(&device->dev_list);
289                 btrfs_free_device(device);
290         }
291         kfree(fs_devices);
292 }
293
294 static void btrfs_kobject_uevent(struct block_device *bdev,
295                                  enum kobject_action action)
296 {
297         int ret;
298
299         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
300         if (ret)
301                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
302                         action,
303                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
304                         &disk_to_dev(bdev->bd_disk)->kobj);
305 }
306
307 void __exit btrfs_cleanup_fs_uuids(void)
308 {
309         struct btrfs_fs_devices *fs_devices;
310
311         while (!list_empty(&fs_uuids)) {
312                 fs_devices = list_entry(fs_uuids.next,
313                                         struct btrfs_fs_devices, fs_list);
314                 list_del(&fs_devices->fs_list);
315                 free_fs_devices(fs_devices);
316         }
317 }
318
319 /*
320  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
321  * Returned struct is not linked onto any lists and must be destroyed using
322  * btrfs_free_device.
323  */
324 static struct btrfs_device *__alloc_device(void)
325 {
326         struct btrfs_device *dev;
327
328         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
329         if (!dev)
330                 return ERR_PTR(-ENOMEM);
331
332         /*
333          * Preallocate a bio that's always going to be used for flushing device
334          * barriers and matches the device lifespan
335          */
336         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
337         if (!dev->flush_bio) {
338                 kfree(dev);
339                 return ERR_PTR(-ENOMEM);
340         }
341
342         INIT_LIST_HEAD(&dev->dev_list);
343         INIT_LIST_HEAD(&dev->dev_alloc_list);
344         INIT_LIST_HEAD(&dev->resized_list);
345
346         spin_lock_init(&dev->io_lock);
347
348         atomic_set(&dev->reada_in_flight, 0);
349         atomic_set(&dev->dev_stats_ccnt, 0);
350         btrfs_device_data_ordered_init(dev);
351         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
352         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
353
354         return dev;
355 }
356
357 /*
358  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
359  * return NULL.
360  *
361  * If devid and uuid are both specified, the match must be exact, otherwise
362  * only devid is used.
363  */
364 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
365                 u64 devid, const u8 *uuid)
366 {
367         struct btrfs_device *dev;
368
369         list_for_each_entry(dev, &fs_devices->devices, dev_list) {
370                 if (dev->devid == devid &&
371                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
372                         return dev;
373                 }
374         }
375         return NULL;
376 }
377
378 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
379 {
380         struct btrfs_fs_devices *fs_devices;
381
382         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
383                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
384                         return fs_devices;
385         }
386         return NULL;
387 }
388
/*
 * Open the block device at @device_path and read the btrfs super block.
 *
 * @device_path: path to the device node
 * @flags:	mode flags for blkdev_get_by_path()
 * @holder:	identifies this claim on the block device
 * @flush:	if non-zero, write back and wait on dirty pagecache before
 *		reading
 * @bdev:	out: the opened block device
 * @bh:		out: buffer head containing the super block
 *
 * Returns 0 on success, in which case the caller owns *bdev (release with
 * blkdev_put()) and *bh (release with brelse()).  On error returns a
 * negative errno and both outputs are NULL.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		/* bdev was opened above, drop our reference before failing */
		blkdev_put(*bdev, flags);
		goto error;
	}
	/* drop any stale pagecache so the super block comes from disk */
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
425
/*
 * Splice a chain of bios (@head .. @tail), previously taken off the list,
 * back onto the front of @pending_bios so they run before anything queued
 * in the meantime.  Callers hold the owning device's io_lock.
 */
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		/* list was non-empty: link our tail to the old first bio */
		tail->bi_next = old_head;
	else
		/* list was empty: our tail becomes the list tail too */
		pending_bios->tail = tail;
}
439
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the schedulers ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;	/* bios submitted since plug started */
	unsigned long last_waited = 0;
	int force_reg = 0;	/* alternate between sync and regular lists */
	int sync_pending = 0;	/* current plug contains sync requests */
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			/* put the rest back and switch to the other list */
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			/* requeue what's left and let another worker run it */
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	/* final race check: bios may have been queued while unlocked */
	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}
623
624 static void pending_bios_fn(struct btrfs_work *work)
625 {
626         struct btrfs_device *device;
627
628         device = container_of(work, struct btrfs_device, work);
629         run_scheduled_bios(device);
630 }
631
/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *
 *  path:	Optional. When provided will it release all unmounted devices
 *		matching this path only.
 *  skip_device: Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
		mutex_lock(&fs_devices->device_list_mutex);
		/* mounted filesystems are never stale, leave them alone */
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			continue;
		}

		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			int not_found = 0;

			if (skip_device && skip_device == device)
				continue;
			/* a path filter cannot match a nameless device */
			if (path && !device->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(device->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			if (fs_devices->num_devices == 0)
				break;
		}
		mutex_unlock(&fs_devices->device_list_mutex);
		/* drop the fs_devices itself once its last device is gone */
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}
}
686
/*
 * Open a single registered device for use by a mounting/mounted filesystem.
 *
 * Reads the super block from @device's path and verifies that the on-disk
 * devid and uuid match the registered device, then records the open bdev
 * and updates the fs_devices counters (open_devices, rw_devices, seeding,
 * rotating).
 *
 * Returns 0 on success.  Returns -EINVAL if the device is already open,
 * has no name, or the on-disk identity does not match; other errors come
 * from btrfs_get_bdev_and_sb().
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	/* verify the on-disk identity matches the registered device */
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		/* seed devices are never writable */
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	/* the replace target devid is excluded from the allocation list */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	/* identity mismatch: release the super block buffer and the bdev */
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}
752
753 /*
754  * Add new device to list of registered devices
755  *
756  * Returns:
757  * device pointer which was just added or updated when successful
758  * error pointer when failed
759  */
760 static noinline struct btrfs_device *device_list_add(const char *path,
761                            struct btrfs_super_block *disk_super,
762                            bool *new_device_added)
763 {
764         struct btrfs_device *device;
765         struct btrfs_fs_devices *fs_devices;
766         struct rcu_string *name;
767         u64 found_transid = btrfs_super_generation(disk_super);
768         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
769
770         fs_devices = find_fsid(disk_super->fsid);
771         if (!fs_devices) {
772                 fs_devices = alloc_fs_devices(disk_super->fsid);
773                 if (IS_ERR(fs_devices))
774                         return ERR_CAST(fs_devices);
775
776                 mutex_lock(&fs_devices->device_list_mutex);
777                 list_add(&fs_devices->fs_list, &fs_uuids);
778
779                 device = NULL;
780         } else {
781                 mutex_lock(&fs_devices->device_list_mutex);
782                 device = find_device(fs_devices, devid,
783                                 disk_super->dev_item.uuid);
784         }
785
786         if (!device) {
787                 if (fs_devices->opened) {
788                         mutex_unlock(&fs_devices->device_list_mutex);
789                         return ERR_PTR(-EBUSY);
790                 }
791
792                 device = btrfs_alloc_device(NULL, &devid,
793                                             disk_super->dev_item.uuid);
794                 if (IS_ERR(device)) {
795                         mutex_unlock(&fs_devices->device_list_mutex);
796                         /* we can safely leave the fs_devices entry around */
797                         return device;
798                 }
799
800                 name = rcu_string_strdup(path, GFP_NOFS);
801                 if (!name) {
802                         btrfs_free_device(device);
803                         mutex_unlock(&fs_devices->device_list_mutex);
804                         return ERR_PTR(-ENOMEM);
805                 }
806                 rcu_assign_pointer(device->name, name);
807
808                 list_add_rcu(&device->dev_list, &fs_devices->devices);
809                 fs_devices->num_devices++;
810
811                 device->fs_devices = fs_devices;
812                 *new_device_added = true;
813
814                 if (disk_super->label[0])
815                         pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
816                                 disk_super->label, devid, found_transid, path);
817                 else
818                         pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
819                                 disk_super->fsid, devid, found_transid, path);
820
821         } else if (!device->name || strcmp(device->name->str, path)) {
822                 /*
823                  * When FS is already mounted.
824                  * 1. If you are here and if the device->name is NULL that
825                  *    means this device was missing at time of FS mount.
826                  * 2. If you are here and if the device->name is different
827                  *    from 'path' that means either
828                  *      a. The same device disappeared and reappeared with
829                  *         different name. or
830                  *      b. The missing-disk-which-was-replaced, has
831                  *         reappeared now.
832                  *
833                  * We must allow 1 and 2a above. But 2b would be a spurious
834                  * and unintentional.
835                  *
836                  * Further in case of 1 and 2a above, the disk at 'path'
837                  * would have missed some transaction when it was away and
838                  * in case of 2a the stale bdev has to be updated as well.
839                  * 2b must not be allowed at all time.
840                  */
841
842                 /*
843                  * For now, we do allow update to btrfs_fs_device through the
844                  * btrfs dev scan cli after FS has been mounted.  We're still
845                  * tracking a problem where systems fail mount by subvolume id
846                  * when we reject replacement on a mounted FS.
847                  */
848                 if (!fs_devices->opened && found_transid < device->generation) {
849                         /*
850                          * That is if the FS is _not_ mounted and if you
851                          * are here, that means there is more than one
852                          * disk with same uuid and devid.We keep the one
853                          * with larger generation number or the last-in if
854                          * generation are equal.
855                          */
856                         mutex_unlock(&fs_devices->device_list_mutex);
857                         return ERR_PTR(-EEXIST);
858                 }
859
860                 /*
861                  * We are going to replace the device path for a given devid,
862                  * make sure it's the same device if the device is mounted
863                  */
864                 if (device->bdev) {
865                         struct block_device *path_bdev;
866
867                         path_bdev = lookup_bdev(path);
868                         if (IS_ERR(path_bdev)) {
869                                 mutex_unlock(&fs_devices->device_list_mutex);
870                                 return ERR_CAST(path_bdev);
871                         }
872
873                         if (device->bdev != path_bdev) {
874                                 bdput(path_bdev);
875                                 mutex_unlock(&fs_devices->device_list_mutex);
876                                 btrfs_warn_in_rcu(device->fs_info,
877                         "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
878                                         disk_super->fsid, devid,
879                                         rcu_str_deref(device->name), path);
880                                 return ERR_PTR(-EEXIST);
881                         }
882                         bdput(path_bdev);
883                         btrfs_info_in_rcu(device->fs_info,
884                                 "device fsid %pU devid %llu moved old:%s new:%s",
885                                 disk_super->fsid, devid,
886                                 rcu_str_deref(device->name), path);
887                 }
888
889                 name = rcu_string_strdup(path, GFP_NOFS);
890                 if (!name) {
891                         mutex_unlock(&fs_devices->device_list_mutex);
892                         return ERR_PTR(-ENOMEM);
893                 }
894                 rcu_string_free(device->name);
895                 rcu_assign_pointer(device->name, name);
896                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
897                         fs_devices->missing_devices--;
898                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
899                 }
900         }
901
902         /*
903          * Unmount does not free the btrfs_device struct but would zero
904          * generation along with most of the other members. So just update
905          * it back. We need it to pick the disk with largest generation
906          * (as above).
907          */
908         if (!fs_devices->opened)
909                 device->generation = found_transid;
910
911         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
912
913         mutex_unlock(&fs_devices->device_list_mutex);
914         return device;
915 }
916
917 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
918 {
919         struct btrfs_fs_devices *fs_devices;
920         struct btrfs_device *device;
921         struct btrfs_device *orig_dev;
922
923         fs_devices = alloc_fs_devices(orig->fsid);
924         if (IS_ERR(fs_devices))
925                 return fs_devices;
926
927         mutex_lock(&orig->device_list_mutex);
928         fs_devices->total_devices = orig->total_devices;
929
930         /* We have held the volume lock, it is safe to get the devices. */
931         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
932                 struct rcu_string *name;
933
934                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
935                                             orig_dev->uuid);
936                 if (IS_ERR(device))
937                         goto error;
938
939                 /*
940                  * This is ok to do without rcu read locked because we hold the
941                  * uuid mutex so nothing we touch in here is going to disappear.
942                  */
943                 if (orig_dev->name) {
944                         name = rcu_string_strdup(orig_dev->name->str,
945                                         GFP_KERNEL);
946                         if (!name) {
947                                 btrfs_free_device(device);
948                                 goto error;
949                         }
950                         rcu_assign_pointer(device->name, name);
951                 }
952
953                 list_add(&device->dev_list, &fs_devices->devices);
954                 device->fs_devices = fs_devices;
955                 fs_devices->num_devices++;
956         }
957         mutex_unlock(&orig->device_list_mutex);
958         return fs_devices;
959 error:
960         mutex_unlock(&orig->device_list_mutex);
961         free_fs_devices(fs_devices);
962         return ERR_PTR(-ENOMEM);
963 }
964
/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the devices which do not belong there.
 *
 * @step: at step 0 the dev_replace state has not been read from the device
 *        tree yet, so a potential replace target (BTRFS_DEV_REPLACE_DEVID)
 *        is always kept; at later steps it is kept only if the REPLACE_TGT
 *        state bit is actually set.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			/*
			 * Device referenced by the metadata: keep it, and
			 * remember the non-replace-target device with the
			 * largest generation as the latest device.
			 */
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		/* Not part of this filesystem any more: release and free it */
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	/* Also prune the seed device lists chained off this fs */
	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	/*
	 * NOTE(review): if no device had IN_FS_METADATA set, latest_dev is
	 * still NULL here and this dereferences a NULL pointer — presumably
	 * callers guarantee at least one in-metadata device; confirm.
	 */
	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}
1031
1032 static void free_device_rcu(struct rcu_head *head)
1033 {
1034         struct btrfs_device *device;
1035
1036         device = container_of(head, struct btrfs_device, rcu);
1037         btrfs_free_device(device);
1038 }
1039
1040 static void btrfs_close_bdev(struct btrfs_device *device)
1041 {
1042         if (!device->bdev)
1043                 return;
1044
1045         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1046                 sync_blockdev(device->bdev);
1047                 invalidate_bdev(device->bdev);
1048         }
1049
1050         blkdev_put(device->bdev, device->mode);
1051 }
1052
/*
 * Close @device and replace it in the fs_devices list with a fresh shadow
 * copy (same devid/uuid/name, no open bdev), so the device stays visible to
 * subsequent scans after unmount.  The old struct is freed via RCU.
 */
static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	/* Undo the per-fs accounting this device contributed */
	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);

	/* NOTE(review): allocation failure here is fatal by design (BUG_ON) */
	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	/* Swap in the shadow copy without disturbing RCU readers */
	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;

	/* Free the old device only after all RCU readers are done with it */
	call_rcu(&device->rcu, free_device_rcu);
}
1089
1090 static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1091 {
1092         struct btrfs_device *device, *tmp;
1093
1094         if (--fs_devices->opened > 0)
1095                 return 0;
1096
1097         mutex_lock(&fs_devices->device_list_mutex);
1098         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1099                 btrfs_close_one_device(device);
1100         }
1101         mutex_unlock(&fs_devices->device_list_mutex);
1102
1103         WARN_ON(fs_devices->open_devices);
1104         WARN_ON(fs_devices->rw_devices);
1105         fs_devices->opened = 0;
1106         fs_devices->seeding = 0;
1107
1108         return 0;
1109 }
1110
1111 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1112 {
1113         struct btrfs_fs_devices *seed_devices = NULL;
1114         int ret;
1115
1116         mutex_lock(&uuid_mutex);
1117         ret = close_fs_devices(fs_devices);
1118         if (!fs_devices->opened) {
1119                 seed_devices = fs_devices->seed;
1120                 fs_devices->seed = NULL;
1121         }
1122         mutex_unlock(&uuid_mutex);
1123
1124         while (seed_devices) {
1125                 fs_devices = seed_devices;
1126                 seed_devices = fs_devices->seed;
1127                 close_fs_devices(fs_devices);
1128                 free_fs_devices(fs_devices);
1129         }
1130         return ret;
1131 }
1132
1133 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1134                                 fmode_t flags, void *holder)
1135 {
1136         struct btrfs_device *device;
1137         struct btrfs_device *latest_dev = NULL;
1138         int ret = 0;
1139
1140         flags |= FMODE_EXCL;
1141
1142         list_for_each_entry(device, &fs_devices->devices, dev_list) {
1143                 /* Just open everything we can; ignore failures here */
1144                 if (btrfs_open_one_device(fs_devices, device, flags, holder))
1145                         continue;
1146
1147                 if (!latest_dev ||
1148                     device->generation > latest_dev->generation)
1149                         latest_dev = device;
1150         }
1151         if (fs_devices->open_devices == 0) {
1152                 ret = -EINVAL;
1153                 goto out;
1154         }
1155         fs_devices->opened = 1;
1156         fs_devices->latest_bdev = latest_dev->bdev;
1157         fs_devices->total_rw_bytes = 0;
1158 out:
1159         return ret;
1160 }
1161
1162 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1163 {
1164         struct btrfs_device *dev1, *dev2;
1165
1166         dev1 = list_entry(a, struct btrfs_device, dev_list);
1167         dev2 = list_entry(b, struct btrfs_device, dev_list);
1168
1169         if (dev1->devid < dev2->devid)
1170                 return -1;
1171         else if (dev1->devid > dev2->devid)
1172                 return 1;
1173         return 0;
1174 }
1175
1176 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1177                        fmode_t flags, void *holder)
1178 {
1179         int ret;
1180
1181         lockdep_assert_held(&uuid_mutex);
1182
1183         mutex_lock(&fs_devices->device_list_mutex);
1184         if (fs_devices->opened) {
1185                 fs_devices->opened++;
1186                 ret = 0;
1187         } else {
1188                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1189                 ret = open_fs_devices(fs_devices, flags, holder);
1190         }
1191         mutex_unlock(&fs_devices->device_list_mutex);
1192
1193         return ret;
1194 }
1195
/* Undo btrfs_read_disk_super(): drop the kmap and the page reference. */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1201
/*
 * Read and validate the btrfs super block at byte offset @bytenr of @bdev
 * through the pagecache (no set_blocksize allowed in the scan path).
 *
 * On success returns 0; *page holds a kmap()ed page reference which the
 * caller must release with btrfs_release_disk_super(), and *disk_super
 * points into that page.  Returns 1 on any failure.
 */
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	/* reject anything without the expected bytenr and btrfs magic */
	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	/* force NUL termination of a label that fills the whole buffer */
	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}
1246
1247 /*
1248  * Look for a btrfs signature on a device. This may be called out of the mount path
1249  * and we are not allowed to call set_blocksize during the scan. The superblock
1250  * is read via pagecache
1251  */
1252 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1253                                            void *holder)
1254 {
1255         struct btrfs_super_block *disk_super;
1256         bool new_device_added = false;
1257         struct btrfs_device *device = NULL;
1258         struct block_device *bdev;
1259         struct page *page;
1260         u64 bytenr;
1261
1262         lockdep_assert_held(&uuid_mutex);
1263
1264         /*
1265          * we would like to check all the supers, but that would make
1266          * a btrfs mount succeed after a mkfs from a different FS.
1267          * So, we need to add a special mount option to scan for
1268          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1269          */
1270         bytenr = btrfs_sb_offset(0);
1271         flags |= FMODE_EXCL;
1272
1273         bdev = blkdev_get_by_path(path, flags, holder);
1274         if (IS_ERR(bdev))
1275                 return ERR_CAST(bdev);
1276
1277         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1278                 device = ERR_PTR(-EINVAL);
1279                 goto error_bdev_put;
1280         }
1281
1282         device = device_list_add(path, disk_super, &new_device_added);
1283         if (!IS_ERR(device)) {
1284                 if (new_device_added)
1285                         btrfs_free_stale_devices(path, device);
1286         }
1287
1288         btrfs_release_disk_super(page);
1289
1290 error_bdev_put:
1291         blkdev_put(bdev, flags);
1292
1293         return device;
1294 }
1295
/*
 * Check whether any pending or pinned chunk stripe overlaps the device hole
 * [*start, *start + len) on @device.
 *
 * Walks the transaction's pending chunks first (when @transaction is given)
 * and then always the fs-wide pinned chunks.  On overlap, *start is advanced
 * past the overlapping stripe (monotonically, never backwards) and 1 is
 * returned; otherwise 0.
 */
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			/* Skip stripes entirely outside the original hole */
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	/* After the pending list, repeat the scan over the pinned chunks */
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}
1349
1350
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @transaction:  transaction whose pending chunks to check (may be NULL)
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space.
 * @len:          the size of the free space. that we find, or the size
 *                of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * Returns 0 when a hole of at least @num_bytes was found, -ENOSPC when no
 * such hole exists, or another negative errno on search failure.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	/* Read from the commit root without taking tree locks */
	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	/* Walk the device's extent items, measuring the gaps between them */
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		/* A pending chunk moved search_start: restart the walk */
		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
1532
/*
 * Convenience wrapper around find_free_dev_extent_start() that begins the
 * search at device offset 0, checking against @trans's pending chunks.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}
1541
/*
 * Delete the dev extent item covering device offset @start and return its
 * length in *dev_extent_len.
 *
 * If the key search overshoots, step back to the previous dev extent item
 * and verify (BUG_ON) that it actually covers @start before retrying the
 * deletion search with the exact found key.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/* No exact match: the covering item must be the previous one */
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		/* Freed space: note the transaction has free block groups */
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}
1600
1601 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1602                                   struct btrfs_device *device,
1603                                   u64 chunk_offset, u64 start, u64 num_bytes)
1604 {
1605         int ret;
1606         struct btrfs_path *path;
1607         struct btrfs_fs_info *fs_info = device->fs_info;
1608         struct btrfs_root *root = fs_info->dev_root;
1609         struct btrfs_dev_extent *extent;
1610         struct extent_buffer *leaf;
1611         struct btrfs_key key;
1612
1613         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1614         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1615         path = btrfs_alloc_path();
1616         if (!path)
1617                 return -ENOMEM;
1618
1619         key.objectid = device->devid;
1620         key.offset = start;
1621         key.type = BTRFS_DEV_EXTENT_KEY;
1622         ret = btrfs_insert_empty_item(trans, root, path, &key,
1623                                       sizeof(*extent));
1624         if (ret)
1625                 goto out;
1626
1627         leaf = path->nodes[0];
1628         extent = btrfs_item_ptr(leaf, path->slots[0],
1629                                 struct btrfs_dev_extent);
1630         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1631                                         BTRFS_CHUNK_TREE_OBJECTID);
1632         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1633                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1634         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1635
1636         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1637         btrfs_mark_buffer_dirty(leaf);
1638 out:
1639         btrfs_free_path(path);
1640         return ret;
1641 }
1642
1643 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1644 {
1645         struct extent_map_tree *em_tree;
1646         struct extent_map *em;
1647         struct rb_node *n;
1648         u64 ret = 0;
1649
1650         em_tree = &fs_info->mapping_tree.map_tree;
1651         read_lock(&em_tree->lock);
1652         n = rb_last(&em_tree->map.rb_root);
1653         if (n) {
1654                 em = rb_entry(n, struct extent_map, rb_node);
1655                 ret = em->start + em->len;
1656         }
1657         read_unlock(&em_tree->lock);
1658
1659         return ret;
1660 }
1661
1662 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1663                                     u64 *devid_ret)
1664 {
1665         int ret;
1666         struct btrfs_key key;
1667         struct btrfs_key found_key;
1668         struct btrfs_path *path;
1669
1670         path = btrfs_alloc_path();
1671         if (!path)
1672                 return -ENOMEM;
1673
1674         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1675         key.type = BTRFS_DEV_ITEM_KEY;
1676         key.offset = (u64)-1;
1677
1678         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1679         if (ret < 0)
1680                 goto error;
1681
1682         BUG_ON(ret == 0); /* Corruption */
1683
1684         ret = btrfs_previous_item(fs_info->chunk_root, path,
1685                                   BTRFS_DEV_ITEMS_OBJECTID,
1686                                   BTRFS_DEV_ITEM_KEY);
1687         if (ret) {
1688                 *devid_ret = 1;
1689         } else {
1690                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1691                                       path->slots[0]);
1692                 *devid_ret = found_key.offset + 1;
1693         }
1694         ret = 0;
1695 error:
1696         btrfs_free_path(path);
1697         return ret;
1698 }
1699
1700 /*
1701  * the device information is stored in the chunk root
1702  * the btrfs_device struct should be fully filled in
1703  */
1704 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1705                             struct btrfs_device *device)
1706 {
1707         int ret;
1708         struct btrfs_path *path;
1709         struct btrfs_dev_item *dev_item;
1710         struct extent_buffer *leaf;
1711         struct btrfs_key key;
1712         unsigned long ptr;
1713
1714         path = btrfs_alloc_path();
1715         if (!path)
1716                 return -ENOMEM;
1717
1718         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1719         key.type = BTRFS_DEV_ITEM_KEY;
1720         key.offset = device->devid;
1721
1722         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1723                                       &key, sizeof(*dev_item));
1724         if (ret)
1725                 goto out;
1726
1727         leaf = path->nodes[0];
1728         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1729
1730         btrfs_set_device_id(leaf, dev_item, device->devid);
1731         btrfs_set_device_generation(leaf, dev_item, 0);
1732         btrfs_set_device_type(leaf, dev_item, device->type);
1733         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1734         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1735         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1736         btrfs_set_device_total_bytes(leaf, dev_item,
1737                                      btrfs_device_get_disk_total_bytes(device));
1738         btrfs_set_device_bytes_used(leaf, dev_item,
1739                                     btrfs_device_get_bytes_used(device));
1740         btrfs_set_device_group(leaf, dev_item, 0);
1741         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1742         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1743         btrfs_set_device_start_offset(leaf, dev_item, 0);
1744
1745         ptr = btrfs_device_uuid(dev_item);
1746         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1747         ptr = btrfs_device_fsid(dev_item);
1748         write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1749         btrfs_mark_buffer_dirty(leaf);
1750
1751         ret = 0;
1752 out:
1753         btrfs_free_path(path);
1754         return ret;
1755 }
1756
1757 /*
1758  * Function to update ctime/mtime for a given device path.
1759  * Mainly used for ctime/mtime based probe like libblkid.
1760  */
1761 static void update_dev_time(const char *path_name)
1762 {
1763         struct file *filp;
1764
1765         filp = filp_open(path_name, O_RDWR, 0);
1766         if (IS_ERR(filp))
1767                 return;
1768         file_update_time(filp);
1769         filp_close(filp, NULL);
1770 }
1771
/*
 * Delete the DEV_ITEM for @device from the chunk tree.
 *
 * Starts its own transaction.  On success the transaction is committed;
 * on any failure it is aborted and ended, and the commit below is skipped
 * because ret is non-zero.  Returns 0 or a negative errno (-ENOENT when no
 * matching item exists).
 */
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	/* DEV_ITEMs are keyed by (DEV_ITEMS_OBJECTID, DEV_ITEM, devid) */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		/* ret > 0 means the exact key was not found */
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	/* Only commit when nothing failed; failures already ended the handle */
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}
1815
1816 /*
1817  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1818  * filesystem. It's up to the caller to adjust that number regarding eg. device
1819  * replace.
1820  */
1821 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1822                 u64 num_devices)
1823 {
1824         u64 all_avail;
1825         unsigned seq;
1826         int i;
1827
1828         do {
1829                 seq = read_seqbegin(&fs_info->profiles_lock);
1830
1831                 all_avail = fs_info->avail_data_alloc_bits |
1832                             fs_info->avail_system_alloc_bits |
1833                             fs_info->avail_metadata_alloc_bits;
1834         } while (read_seqretry(&fs_info->profiles_lock, seq));
1835
1836         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1837                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1838                         continue;
1839
1840                 if (num_devices < btrfs_raid_array[i].devs_min) {
1841                         int ret = btrfs_raid_array[i].mindev_error;
1842
1843                         if (ret)
1844                                 return ret;
1845                 }
1846         }
1847
1848         return 0;
1849 }
1850
1851 static struct btrfs_device * btrfs_find_next_active_device(
1852                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1853 {
1854         struct btrfs_device *next_device;
1855
1856         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1857                 if (next_device != device &&
1858                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1859                     && next_device->bdev)
1860                         return next_device;
1861         }
1862
1863         return NULL;
1864 }
1865
1866 /*
1867  * Helper function to check if the given device is part of s_bdev / latest_bdev
1868  * and replace it with the provided or the next active device, in the context
1869  * where this function called, there should be always be another device (or
1870  * this_dev) which is active.
1871  */
1872 void btrfs_assign_next_active_device(struct btrfs_device *device,
1873                                      struct btrfs_device *this_dev)
1874 {
1875         struct btrfs_fs_info *fs_info = device->fs_info;
1876         struct btrfs_device *next_device;
1877
1878         if (this_dev)
1879                 next_device = this_dev;
1880         else
1881                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1882                                                                 device);
1883         ASSERT(next_device);
1884
1885         if (fs_info->sb->s_bdev &&
1886                         (fs_info->sb->s_bdev == device->bdev))
1887                 fs_info->sb->s_bdev = next_device->bdev;
1888
1889         if (fs_info->fs_devices->latest_bdev == device->bdev)
1890                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1891 }
1892
1893 /*
1894  * Return btrfs_fs_devices::num_devices excluding the device that's being
1895  * currently replaced.
1896  */
1897 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1898 {
1899         u64 num_devices = fs_info->fs_devices->num_devices;
1900
1901         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1902         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1903                 ASSERT(num_devices > 1);
1904                 num_devices--;
1905         }
1906         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1907
1908         return num_devices;
1909 }
1910
/*
 * Remove a device from a mounted filesystem, identified either by @devid
 * (non-zero) or by @device_path (which may be the literal "missing").
 *
 * Serialized under uuid_mutex.  Checks RAID min-device constraints, shrinks
 * the device to zero, deletes its DEV_ITEM, unlinks it from the device
 * lists and scratches its superblocks.  Returns 0 or a negative errno /
 * positive BTRFS_ERROR_DEV_* code.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = btrfs_num_devices(fs_info);

	/* Make sure the remaining profiles survive losing one device */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	/* Stop new allocations landing on the device we are removing */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/* uuid_mutex is dropped across the potentially long-running shrink */
	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	/* Re-point s_bdev/latest_bdev away from the departing device */
	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	/* Last open device of a (seed) fs_devices: unlink and free it */
	if (cur_devices->open_devices == 0) {
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	/* Undo the allocation-list removal done before the shrink */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
2061
2062 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2063 {
2064         struct btrfs_fs_devices *fs_devices;
2065
2066         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2067
2068         /*
2069          * in case of fs with no seed, srcdev->fs_devices will point
2070          * to fs_devices of fs_info. However when the dev being replaced is
2071          * a seed dev it will point to the seed's local fs_devices. In short
2072          * srcdev will have its correct fs_devices in both the cases.
2073          */
2074         fs_devices = srcdev->fs_devices;
2075
2076         list_del_rcu(&srcdev->dev_list);
2077         list_del(&srcdev->dev_alloc_list);
2078         fs_devices->num_devices--;
2079         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2080                 fs_devices->missing_devices--;
2081
2082         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2083                 fs_devices->rw_devices--;
2084
2085         if (srcdev->bdev)
2086                 fs_devices->open_devices--;
2087 }
2088
/*
 * Final teardown of a replace source device: scratch its superblocks if it
 * was writable, close its bdev and free it via RCU.  If it was the last
 * device of a seed fs_devices, unlink that fs_devices from the seed chain
 * and free it as well.
 */
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		/* Walk the seed chain and unlink fs_devices from it */
		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}
2127
/*
 * Remove the replace target device @tgtdev from the device list and release
 * it.  Takes device_list_mutex itself; the superblock scratching happens
 * after the device is unlinked, deliberately outside the mutex (see the
 * comment below).
 */
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	WARN_ON(!tgtdev);
	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	/* Re-point s_bdev/latest_bdev away from the departing device */
	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * The update_dev_time() with in btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}
2160
2161 static struct btrfs_device *btrfs_find_device_by_path(
2162                 struct btrfs_fs_info *fs_info, const char *device_path)
2163 {
2164         int ret = 0;
2165         struct btrfs_super_block *disk_super;
2166         u64 devid;
2167         u8 *dev_uuid;
2168         struct block_device *bdev;
2169         struct buffer_head *bh;
2170         struct btrfs_device *device;
2171
2172         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2173                                     fs_info->bdev_holder, 0, &bdev, &bh);
2174         if (ret)
2175                 return ERR_PTR(ret);
2176         disk_super = (struct btrfs_super_block *)bh->b_data;
2177         devid = btrfs_stack_device_id(&disk_super->dev_item);
2178         dev_uuid = disk_super->dev_item.uuid;
2179         device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2180         brelse(bh);
2181         if (!device)
2182                 device = ERR_PTR(-ENOENT);
2183         blkdev_put(bdev, FMODE_READ);
2184         return device;
2185 }
2186
2187 static struct btrfs_device *btrfs_find_device_missing_or_by_path(
2188                 struct btrfs_fs_info *fs_info, const char *device_path)
2189 {
2190         struct btrfs_device *device = NULL;
2191         if (strcmp(device_path, "missing") == 0) {
2192                 struct list_head *devices;
2193                 struct btrfs_device *tmp;
2194
2195                 devices = &fs_info->fs_devices->devices;
2196                 list_for_each_entry(tmp, devices, dev_list) {
2197                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2198                                         &tmp->dev_state) && !tmp->bdev) {
2199                                 device = tmp;
2200                                 break;
2201                         }
2202                 }
2203
2204                 if (!device)
2205                         return ERR_PTR(-ENOENT);
2206         } else {
2207                 device = btrfs_find_device_by_path(fs_info, device_path);
2208         }
2209
2210         return device;
2211 }
2212
2213 /*
2214  * Lookup a device given by device id, or the path if the id is 0.
2215  */
2216 struct btrfs_device *btrfs_find_device_by_devspec(
2217                 struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
2218 {
2219         struct btrfs_device *device;
2220
2221         if (devid) {
2222                 device = btrfs_find_device(fs_info, devid, NULL, NULL);
2223                 if (!device)
2224                         return ERR_PTR(-ENOENT);
2225         } else {
2226                 if (!devpath || !devpath[0])
2227                         return ERR_PTR(-EINVAL);
2228                 device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
2229         }
2230         return device;
2231 }
2232
/*
 * does all the dirty work required for changing file system's UUID.
 *
 * The current fs_devices (which must be seeding) hands all of its devices
 * over to a newly allocated "seed" fs_devices that keeps the old fsid,
 * while fs_devices itself is reset and given a freshly generated fsid.
 * Finally the SEEDING flag is cleared from the in-memory superblock.
 * Must be called with uuid_mutex held.  Returns 0 or a negative errno.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	/* Sprouting only makes sense on a seeding filesystem */
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	/* The clone carrying the old fsid stays on the global list */
	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	/* Move every current device over to the seed fs_devices */
	mutex_lock(&fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	/* fs_devices starts over empty, with the seed chained below it */
	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}
2295
/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks every DEV_ITEM in the chunk tree and, for devices belonging to a
 * seeding fs_devices, rewrites the on-disk generation with the in-memory
 * one.  Returns 0 or a negative errno.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Start at the first possible DEV_ITEM key */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		/* Past the last slot: advance leaf, then re-search from key */
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		/* Only devices on a seeding fs_devices need updating */
		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
2369
/*
 * Add the device at @device_path to the mounted filesystem.
 *
 * Handles both the regular case and "seeding": when the mounted fs_devices
 * is a read-only seed, adding a writable device sprouts a new filesystem on
 * top of it (btrfs_prepare_sprout()/btrfs_finish_sprout()), which changes
 * the fsid and requires holding sb->s_umount and uuid_mutex throughout.
 *
 * Returns 0 on success or a negative errno.  On failure, every in-memory
 * list insertion and superblock counter modified here is rolled back via
 * the error_sysfs/error_trans labels.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 orig_super_total_bytes;	/* saved for rollback on error */
	u64 orig_super_num_devices;	/* saved for rollback on error */
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;		/* seeding locks already dropped? */

	/* A read-only mount may only be modified when it is a seed fs */
	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_devices->seeding) {
		seeding_dev = 1;
		/*
		 * Sprouting rewrites the filesystem identity, so block
		 * concurrent (u)mount and uuid-based device handling.
		 */
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	/* Refuse to add a device that is already part of this filesystem */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	/* Initialize the in-memory device; sizes rounded to the sector size */
	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* The sprouted filesystem becomes writable from here on */
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_devices;

	/* Publish the device on the fs lists and bump all the counters */
	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	/* Any rotational member device marks the whole fs as rotating */
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		/* Create the first writable chunks of the sprouted fs */
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/* Sprouting would change fsid of the mounted root,
		 * so rename the fsid on the sysfs
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		/*
		 * Relocation failure here is recoverable by the user
		 * with a manual balance (see the error message).
		 */
		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	/* Undo the sysfs link, list insertions and counter updates */
	btrfs_sysfs_rm_device_link(fs_devices, device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	/* Drop the seeding locks unless the success path already did */
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
2595
2596 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2597                                         struct btrfs_device *device)
2598 {
2599         int ret;
2600         struct btrfs_path *path;
2601         struct btrfs_root *root = device->fs_info->chunk_root;
2602         struct btrfs_dev_item *dev_item;
2603         struct extent_buffer *leaf;
2604         struct btrfs_key key;
2605
2606         path = btrfs_alloc_path();
2607         if (!path)
2608                 return -ENOMEM;
2609
2610         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2611         key.type = BTRFS_DEV_ITEM_KEY;
2612         key.offset = device->devid;
2613
2614         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2615         if (ret < 0)
2616                 goto out;
2617
2618         if (ret > 0) {
2619                 ret = -ENOENT;
2620                 goto out;
2621         }
2622
2623         leaf = path->nodes[0];
2624         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2625
2626         btrfs_set_device_id(leaf, dev_item, device->devid);
2627         btrfs_set_device_type(leaf, dev_item, device->type);
2628         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2629         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2630         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2631         btrfs_set_device_total_bytes(leaf, dev_item,
2632                                      btrfs_device_get_disk_total_bytes(device));
2633         btrfs_set_device_bytes_used(leaf, dev_item,
2634                                     btrfs_device_get_bytes_used(device));
2635         btrfs_mark_buffer_dirty(leaf);
2636
2637 out:
2638         btrfs_free_path(path);
2639         return ret;
2640 }
2641
2642 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2643                       struct btrfs_device *device, u64 new_size)
2644 {
2645         struct btrfs_fs_info *fs_info = device->fs_info;
2646         struct btrfs_super_block *super_copy = fs_info->super_copy;
2647         struct btrfs_fs_devices *fs_devices;
2648         u64 old_total;
2649         u64 diff;
2650
2651         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2652                 return -EACCES;
2653
2654         new_size = round_down(new_size, fs_info->sectorsize);
2655
2656         mutex_lock(&fs_info->chunk_mutex);
2657         old_total = btrfs_super_total_bytes(super_copy);
2658         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2659
2660         if (new_size <= device->total_bytes ||
2661             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2662                 mutex_unlock(&fs_info->chunk_mutex);
2663                 return -EINVAL;
2664         }
2665
2666         fs_devices = fs_info->fs_devices;
2667
2668         btrfs_set_super_total_bytes(super_copy,
2669                         round_down(old_total + diff, fs_info->sectorsize));
2670         device->fs_devices->total_rw_bytes += diff;
2671
2672         btrfs_device_set_total_bytes(device, new_size);
2673         btrfs_device_set_disk_total_bytes(device, new_size);
2674         btrfs_clear_space_info_full(device->fs_info);
2675         if (list_empty(&device->resized_list))
2676                 list_add_tail(&device->resized_list,
2677                               &fs_devices->resized_devices);
2678         mutex_unlock(&fs_info->chunk_mutex);
2679
2680         return btrfs_update_device(trans, device);
2681 }
2682
2683 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2684 {
2685         struct btrfs_fs_info *fs_info = trans->fs_info;
2686         struct btrfs_root *root = fs_info->chunk_root;
2687         int ret;
2688         struct btrfs_path *path;
2689         struct btrfs_key key;
2690
2691         path = btrfs_alloc_path();
2692         if (!path)
2693                 return -ENOMEM;
2694
2695         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2696         key.offset = chunk_offset;
2697         key.type = BTRFS_CHUNK_ITEM_KEY;
2698
2699         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2700         if (ret < 0)
2701                 goto out;
2702         else if (ret > 0) { /* Logic error or corruption */
2703                 btrfs_handle_fs_error(fs_info, -ENOENT,
2704                                       "Failed lookup while freeing chunk.");
2705                 ret = -ENOENT;
2706                 goto out;
2707         }
2708
2709         ret = btrfs_del_item(trans, root, path);
2710         if (ret < 0)
2711                 btrfs_handle_fs_error(fs_info, ret,
2712                                       "Failed to delete chunk item.");
2713 out:
2714         btrfs_free_path(path);
2715         return ret;
2716 }
2717
/*
 * Remove the chunk at @chunk_offset from the superblock's in-memory
 * sys_chunk_array by compacting the array in place.
 *
 * Returns 0 on success or -EIO if a non-CHUNK_ITEM key is found while
 * walking the array (the array must contain only chunk items).
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;		/* cursor into sys_chunk_array */
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;		/* byte size of the entry under the cursor */
	u32 cur;		/* byte offset of the cursor */
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	/* Each array entry is a disk key followed by a chunk item */
	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			/* Entry length depends on the stripe count */
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			/*
			 * Found the target: shift the rest of the array
			 * down over it and shrink the recorded size.
			 * The cursor is not advanced — the shifted data
			 * now sits under it and is examined next.
			 */
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}
2764
/*
 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @fs_info: Filesystem whose mapping tree is searched.
 * @logical: Logical block offset in bytes.
 * @length: Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR(-EINVAL) if no mapping is found or the
 * found mapping does not cover @logical.  On success the caller receives a
 * reference on the extent_map and must drop it with free_extent_map().
 */
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
				       u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	/* Sanity check that the returned mapping actually covers @logical */
	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}
2800
/*
 * Delete the chunk at @chunk_offset: free every stripe's device extent,
 * remove the chunk tree item (plus the sys_chunk_array copy for SYSTEM
 * chunks) and finally remove the block group.
 *
 * Returns 0 on success; any failure aborts the transaction first.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	/* Free each stripe's device extent and update the stripe's device */
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			/* Return the extent's space to the free pool */
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	/* SYSTEM chunks are also mirrored into the superblock array */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}
2889
/*
 * Relocate all extents out of the chunk at @chunk_offset and then delete
 * it via btrfs_remove_chunk().
 *
 * The caller must hold fs_info->delete_unused_bgs_mutex (asserted below);
 * see the comment in the body for why.
 */
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
2947
/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.
 *
 * Chunks that fail relocation with -ENOSPC are counted and the whole
 * walk is retried exactly once; if failures remain after the retry,
 * return -ENOSPC.  Any other relocation failure is a BUG.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;	/* only retry the -ENOSPC set once */
	int failed = 0;		/* count of -ENOSPC relocations this pass */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	/* Start from the highest possible chunk offset and walk backwards */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * Must be taken before the tree search and held across
		 * btrfs_relocate_chunk(); see the comment in that function.
		 */
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		/*
		 * Any nonzero return (error or no previous item) leaves
		 * the loop, so drop the mutex here; the zero path keeps
		 * holding it until after the relocation below.
		 */
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* Release the path before relocating; key state is saved */
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
3021
3022 /*
3023  * return 1 : allocate a data chunk successfully,
3024  * return <0: errors during allocating a data chunk,
3025  * return 0 : no need to allocate a data chunk.
3026  */
3027 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3028                                       u64 chunk_offset)
3029 {
3030         struct btrfs_block_group_cache *cache;
3031         u64 bytes_used;
3032         u64 chunk_type;
3033
3034         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3035         ASSERT(cache);
3036         chunk_type = cache->flags;
3037         btrfs_put_block_group(cache);
3038
3039         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3040                 spin_lock(&fs_info->data_sinfo->lock);
3041                 bytes_used = fs_info->data_sinfo->bytes_used;
3042                 spin_unlock(&fs_info->data_sinfo->lock);
3043
3044                 if (!bytes_used) {
3045                         struct btrfs_trans_handle *trans;
3046                         int ret;
3047
3048                         trans = btrfs_join_transaction(fs_info->tree_root);
3049                         if (IS_ERR(trans))
3050                                 return PTR_ERR(trans);
3051
3052                         ret = btrfs_force_chunk_alloc(trans,
3053                                                       BTRFS_BLOCK_GROUP_DATA);
3054                         btrfs_end_transaction(trans);
3055                         if (ret < 0)
3056                                 return ret;
3057
3058                         btrfs_add_raid_kobjects(fs_info);
3059
3060                         return 1;
3061                 }
3062         }
3063         return 0;
3064 }
3065
3066 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3067                                struct btrfs_balance_control *bctl)
3068 {
3069         struct btrfs_root *root = fs_info->tree_root;
3070         struct btrfs_trans_handle *trans;
3071         struct btrfs_balance_item *item;
3072         struct btrfs_disk_balance_args disk_bargs;
3073         struct btrfs_path *path;
3074         struct extent_buffer *leaf;
3075         struct btrfs_key key;
3076         int ret, err;
3077
3078         path = btrfs_alloc_path();
3079         if (!path)
3080                 return -ENOMEM;
3081
3082         trans = btrfs_start_transaction(root, 0);
3083         if (IS_ERR(trans)) {
3084                 btrfs_free_path(path);
3085                 return PTR_ERR(trans);
3086         }
3087
3088         key.objectid = BTRFS_BALANCE_OBJECTID;
3089         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3090         key.offset = 0;
3091
3092         ret = btrfs_insert_empty_item(trans, root, path, &key,
3093                                       sizeof(*item));
3094         if (ret)
3095                 goto out;
3096
3097         leaf = path->nodes[0];
3098         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3099
3100         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3101
3102         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3103         btrfs_set_balance_data(leaf, item, &disk_bargs);
3104         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3105         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3106         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3107         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3108
3109         btrfs_set_balance_flags(leaf, item, bctl->flags);
3110
3111         btrfs_mark_buffer_dirty(leaf);
3112 out:
3113         btrfs_free_path(path);
3114         err = btrfs_commit_transaction(trans);
3115         if (err && !ret)
3116                 ret = err;
3117         return ret;
3118 }
3119
3120 static int del_balance_item(struct btrfs_fs_info *fs_info)
3121 {
3122         struct btrfs_root *root = fs_info->tree_root;
3123         struct btrfs_trans_handle *trans;
3124         struct btrfs_path *path;
3125         struct btrfs_key key;
3126         int ret, err;
3127
3128         path = btrfs_alloc_path();
3129         if (!path)
3130                 return -ENOMEM;
3131
3132         trans = btrfs_start_transaction(root, 0);
3133         if (IS_ERR(trans)) {
3134                 btrfs_free_path(path);
3135                 return PTR_ERR(trans);
3136         }
3137
3138         key.objectid = BTRFS_BALANCE_OBJECTID;
3139         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3140         key.offset = 0;
3141
3142         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3143         if (ret < 0)
3144                 goto out;
3145         if (ret > 0) {
3146                 ret = -ENOENT;
3147                 goto out;
3148         }
3149
3150         ret = btrfs_del_item(trans, root, path);
3151 out:
3152         btrfs_free_path(path);
3153         err = btrfs_commit_transaction(trans);
3154         if (err && !ret)
3155                 ret = err;
3156         return ret;
3157 }
3158
3159 /*
3160  * This is a heuristic used to reduce the number of chunks balanced on
3161  * resume after balance was interrupted.
3162  */
3163 static void update_balance_args(struct btrfs_balance_control *bctl)
3164 {
3165         /*
3166          * Turn on soft mode for chunk types that were being converted.
3167          */
3168         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3169                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3170         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3171                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3172         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3173                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3174
3175         /*
3176          * Turn on usage filter if is not already used.  The idea is
3177          * that chunks that we have already balanced should be
3178          * reasonably full.  Don't do it for chunks that are being
3179          * converted - that will keep us from relocating unconverted
3180          * (albeit full) chunks.
3181          */
3182         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3183             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3184             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3185                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3186                 bctl->data.usage = 90;
3187         }
3188         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3189             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3190             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3191                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3192                 bctl->sys.usage = 90;
3193         }
3194         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3195             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3196             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3197                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3198                 bctl->meta.usage = 90;
3199         }
3200 }
3201
3202 /*
3203  * Clear the balance status in fs_info and delete the balance item from disk.
3204  */
3205 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3206 {
3207         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3208         int ret;
3209
3210         BUG_ON(!fs_info->balance_ctl);
3211
3212         spin_lock(&fs_info->balance_lock);
3213         fs_info->balance_ctl = NULL;
3214         spin_unlock(&fs_info->balance_lock);
3215
3216         kfree(bctl);
3217         ret = del_balance_item(fs_info);
3218         if (ret)
3219                 btrfs_handle_fs_error(fs_info, ret, NULL);
3220 }
3221
3222 /*
3223  * Balance filters.  Return 1 if chunk should be filtered out
3224  * (should not be balanced).
3225  */
3226 static int chunk_profiles_filter(u64 chunk_type,
3227                                  struct btrfs_balance_args *bargs)
3228 {
3229         chunk_type = chunk_to_extended(chunk_type) &
3230                                 BTRFS_EXTENDED_PROFILE_MASK;
3231
3232         if (bargs->profiles & chunk_type)
3233                 return 0;
3234
3235         return 1;
3236 }
3237
3238 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3239                               struct btrfs_balance_args *bargs)
3240 {
3241         struct btrfs_block_group_cache *cache;
3242         u64 chunk_used;
3243         u64 user_thresh_min;
3244         u64 user_thresh_max;
3245         int ret = 1;
3246
3247         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3248         chunk_used = btrfs_block_group_used(&cache->item);
3249
3250         if (bargs->usage_min == 0)
3251                 user_thresh_min = 0;
3252         else
3253                 user_thresh_min = div_factor_fine(cache->key.offset,
3254                                         bargs->usage_min);
3255
3256         if (bargs->usage_max == 0)
3257                 user_thresh_max = 1;
3258         else if (bargs->usage_max > 100)
3259                 user_thresh_max = cache->key.offset;
3260         else
3261                 user_thresh_max = div_factor_fine(cache->key.offset,
3262                                         bargs->usage_max);
3263
3264         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3265                 ret = 0;
3266
3267         btrfs_put_block_group(cache);
3268         return ret;
3269 }
3270
3271 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3272                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3273 {
3274         struct btrfs_block_group_cache *cache;
3275         u64 chunk_used, user_thresh;
3276         int ret = 1;
3277
3278         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3279         chunk_used = btrfs_block_group_used(&cache->item);
3280
3281         if (bargs->usage_min == 0)
3282                 user_thresh = 1;
3283         else if (bargs->usage > 100)
3284                 user_thresh = cache->key.offset;
3285         else
3286                 user_thresh = div_factor_fine(cache->key.offset,
3287                                               bargs->usage);
3288
3289         if (chunk_used < user_thresh)
3290                 ret = 0;
3291
3292         btrfs_put_block_group(cache);
3293         return ret;
3294 }
3295
3296 static int chunk_devid_filter(struct extent_buffer *leaf,
3297                               struct btrfs_chunk *chunk,
3298                               struct btrfs_balance_args *bargs)
3299 {
3300         struct btrfs_stripe *stripe;
3301         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3302         int i;
3303
3304         for (i = 0; i < num_stripes; i++) {
3305                 stripe = btrfs_stripe_nr(chunk, i);
3306                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3307                         return 0;
3308         }
3309
3310         return 1;
3311 }
3312
3313 /* [pstart, pend) */
3314 static int chunk_drange_filter(struct extent_buffer *leaf,
3315                                struct btrfs_chunk *chunk,
3316                                struct btrfs_balance_args *bargs)
3317 {
3318         struct btrfs_stripe *stripe;
3319         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3320         u64 stripe_offset;
3321         u64 stripe_length;
3322         int factor;
3323         int i;
3324
3325         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3326                 return 0;
3327
3328         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3329              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3330                 factor = num_stripes / 2;
3331         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3332                 factor = num_stripes - 1;
3333         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3334                 factor = num_stripes - 2;
3335         } else {
3336                 factor = num_stripes;
3337         }
3338
3339         for (i = 0; i < num_stripes; i++) {
3340                 stripe = btrfs_stripe_nr(chunk, i);
3341                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3342                         continue;
3343
3344                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3345                 stripe_length = btrfs_chunk_length(leaf, chunk);
3346                 stripe_length = div_u64(stripe_length, factor);
3347
3348                 if (stripe_offset < bargs->pend &&
3349                     stripe_offset + stripe_length > bargs->pstart)
3350                         return 0;
3351         }
3352
3353         return 1;
3354 }
3355
3356 /* [vstart, vend) */
3357 static int chunk_vrange_filter(struct extent_buffer *leaf,
3358                                struct btrfs_chunk *chunk,
3359                                u64 chunk_offset,
3360                                struct btrfs_balance_args *bargs)
3361 {
3362         if (chunk_offset < bargs->vend &&
3363             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3364                 /* at least part of the chunk is inside this vrange */
3365                 return 0;
3366
3367         return 1;
3368 }
3369
3370 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3371                                struct btrfs_chunk *chunk,
3372                                struct btrfs_balance_args *bargs)
3373 {
3374         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3375
3376         if (bargs->stripes_min <= num_stripes
3377                         && num_stripes <= bargs->stripes_max)
3378                 return 0;
3379
3380         return 1;
3381 }
3382
3383 static int chunk_soft_convert_filter(u64 chunk_type,
3384                                      struct btrfs_balance_args *bargs)
3385 {
3386         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3387                 return 0;
3388
3389         chunk_type = chunk_to_extended(chunk_type) &
3390                                 BTRFS_EXTENDED_PROFILE_MASK;
3391
3392         if (bargs->target == chunk_type)
3393                 return 1;
3394
3395         return 0;
3396 }
3397
/*
 * Run a chunk through every enabled balance filter.  Returns 1 if the
 * chunk should be relocated, 0 if any filter rejects it.
 *
 * Note: the limit/limit_range filters below decrement counters in the
 * balance args, so calling this function has side effects on bctl.
 */
static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/*
	 * Pick the per-type args.  The type filter above guarantees the
	 * chunk is of a selected type, so one of these branches matches
	 * and bargs is non-NULL below.
	 */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter: single-value and range variants are exclusive */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 * (decrementing here counts the chunk as accepted)
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3486
/*
 * Core of balance: shrink-and-regrow every writable device to make room,
 * then walk the chunk tree backwards twice.  The first pass only counts
 * the chunks that pass the filters (for accurate stats and limit_min);
 * the second pass actually relocates them.
 *
 * Returns 0 on success, -ECANCELED on pause/cancel, -ENOSPC if any chunk
 * hit ENOSPC, or another negative errno.
 */
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and min/max limits use the same bytes in
	 * the balance args, so stash the single values before the counting
	 * pass and restore them for the real pass below.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	/* step one make some room on all the devices */
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		old_size = btrfs_device_get_total_bytes(device);
		size_to_free = div_factor(old_size, 1);
		size_to_free = min_t(u64, size_to_free, SZ_1M);
		/*
		 * Skip devices that are read-only, already have enough free
		 * space, or are a replace target.
		 */
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		/* Shrink to push allocations off the tail of the device. */
		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}

		trans = btrfs_start_transaction(dev_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		/* Grow back to the original size; only the layout changed. */
		ret = btrfs_grow_device(trans, device, old_size);
		if (ret) {
			btrfs_end_transaction(trans);
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		btrfs_end_transaction(trans);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same
		 * bytes in the balance args; restore the single values that
		 * the counting pass may have decremented.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	/* Walk the chunk tree from the highest offset downwards. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/* Pause only interrupts the relocation pass, not counting. */
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		/*
		 * Hold delete_unused_bgs_mutex across search + relocate so
		 * the block group cleaner cannot remove the chunk under us.
		 */
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			/* No more chunk items: this pass is complete. */
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(fs_info, leaf, chunk,
					   found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (counting) {
			/* First pass: only record what would be balanced. */
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret == -ENOSPC) {
			/* Keep going; reported as -ENOSPC at the end. */
			enospc_errors++;
		} else if (ret == -ETXTBSY) {
			/* Chunk pinned by an active swapfile: skip it. */
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	/* Counting pass finished: rewind and do the real relocation pass. */
	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}
3728
3729 /**
3730  * alloc_profile_is_valid - see if a given profile is valid and reduced
3731  * @flags: profile to validate
3732  * @extended: if true @flags is treated as an extended profile
3733  */
3734 static int alloc_profile_is_valid(u64 flags, int extended)
3735 {
3736       &nb