// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};

const char *get_raid_name(enum btrfs_raid_types type)
{
        if (type >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[type].raid_name;
}
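
/*
 * Illustrative only (not part of the original file): the enum value doubles
 * as the index into btrfs_raid_array above, so a caller sketch looks like:
 *
 *      const struct btrfs_raid_attr *attr = &btrfs_raid_array[BTRFS_RAID_RAID1];
 *      int min_devs = attr->devs_min;                      // 2 for raid1
 *      const char *name = get_raid_name(BTRFS_RAID_RAID1); // "raid1", or NULL
 *                                                          // if out of range
 */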

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * For the duration of the Paused state, BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled
 * or completed.
 */
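
/*
 * A minimal sketch (not from the original file) of the nesting order above,
 * for a path that must update both the device list and the chunk map, e.g.
 * device add: device_list_mutex is taken before chunk_mutex and released
 * after it.
 *
 *      mutex_lock(&fs_devices->device_list_mutex);
 *      mutex_lock(&fs_info->chunk_mutex);
 *      ...                     // update chunks and the device list
 *      mutex_unlock(&fs_info->chunk_mutex);
 *      mutex_unlock(&fs_devices->device_list_mutex);
 */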

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
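
/*
 * Illustrative caller sketch (not part of the original file): NULL is never
 * returned, so callers test with IS_ERR():
 *
 *      struct btrfs_fs_devices *fs_devs = alloc_fs_devices(fsid);
 *
 *      if (IS_ERR(fs_devs))
 *              return PTR_ERR(fs_devs);        // only -ENOMEM today
 */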

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}
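
/*
 * Illustrative only: a NULL @uuid relaxes the match to devid alone, which is
 * what a caller wants when the on-disk uuid is not yet known:
 *
 *      dev = find_device(fs_devices, devid, disk_super->dev_item.uuid);
 *      dev = find_device(fs_devices, devid, NULL);     // match devid only
 */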

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}
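
/*
 * Illustrative caller sketch (not from the original file): on success the
 * caller owns both the block device and the superblock buffer head and must
 * release them itself; on failure both out parameters are already NULL:
 *
 *      ret = btrfs_get_bdev_and_sb(path, FMODE_READ | FMODE_EXCL, holder,
 *                                  1, &bdev, &bh);
 *      if (ret)
 *              return ret;
 *      ...
 *      brelse(bh);
 *      blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 */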

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:       Optional. When provided, it will release all unmounted devices
 *              matching this path only.
 *  skip_dev:   Optional. Will skip this device when searching for the stale
 *              devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_dev)
{
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;

        list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

                if (fs_devs->opened)
                        continue;

                list_for_each_entry_safe(dev, tmp_dev,
                                         &fs_devs->devices, dev_list) {
                        int not_found = 0;

                        if (skip_dev && skip_dev == dev)
                                continue;
                        if (path && !dev->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(dev->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                btrfs_free_device(dev);
                        }
                }
        }
}
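
/*
 * Illustrative only: the two call shapes used in this file. device_list_add()
 * passes both arguments so a rescan of @path can drop an older registration
 * of the same path without freeing the device it has just added:
 *
 *      btrfs_free_stale_devices(NULL, NULL);   // release all unmounted devices
 *      btrfs_free_stale_devices(path, device); // stale entries for @path only
 */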

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return ERR_PTR(-EBUSY);

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                btrfs_free_stale_devices(path, device);

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at the time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions when it was away, and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if generations are equal.
                         */
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        return device;
}
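
/*
 * Illustrative only: how the scan path further below drives this function.
 * The returned device is owned by its fs_devices, so the caller only reads
 * from it:
 *
 *      mutex_lock(&uuid_mutex);
 *      device = device_list_add(path, disk_super);
 *      if (!IS_ERR(device))
 *              fs_devices = device->fs_devices;
 *      mutex_unlock(&uuid_mutex);
 */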

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold orig->device_list_mutex, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any devices that do not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;

        INIT_LIST_HEAD(&pending_put);

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_prepare_close_one_device(device);
                list_add(&device->dev_list, &pending_put);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * btrfs_show_devname() is using the device_list_mutex, and sometimes
         * a call to blkdev_put() leads the VFS back into this function. So
         * do the put outside of device_list_mutex, for now.
         */
        while (!list_empty(&pending_put)) {
                device = list_first_entry(&pending_put,
                                struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_close_bdev(device);
                call_rcu(&device->rcu, free_device_rcu);
        }

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        mutex_lock(&fs_devices->device_list_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&fs_devices->device_list_mutex);
        mutex_unlock(&uuid_mutex);

        return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}
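
/*
 * Worked example (illustrative, assuming 4K pages): the primary super lives
 * at bytenr 65536, so index = 65536 >> PAGE_SHIFT = 16 and the in-page
 * offset (bytenr & ~PAGE_MASK) is 0. A hypothetical bytenr of 67584 would
 * place the 4096-byte superblock across pages 16 and 17 and be rejected by
 * the straddle check above.
 */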

/*
 * Look for a btrfs signature on a device. This may be called out of the
 * mount path and we are not allowed to call set_blocksize during the scan.
 * The superblock is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
        int ret = 0;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }

        mutex_lock(&uuid_mutex);
        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;
        mutex_unlock(&uuid_mutex);

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);

        return ret;
}
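
/*
 * Illustrative caller sketch (not from the original file; the device path
 * is hypothetical). This is roughly what device registration does:
 *
 *      struct btrfs_fs_devices *fs_devices;
 *      int ret;
 *
 *      ret = btrfs_scan_one_device("/dev/sdb", FMODE_READ, holder,
 *                                  &fs_devices);
 *      if (ret)
 *              return ret;     // no btrfs signature found, or an I/O error
 */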

static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space.
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start) {
                hole_size = search_end - search_start;

                if (contains_pending_extent(transaction, device, &search_start,
                                            hole_size)) {
                        btrfs_release_path(path);
                        goto again;
                }

                if (hole_size > max_hole_size) {
                        max_hole_start = search_start;
                        max_hole_size = hole_size;
                }
        }

        /* See above. */
        if (max_hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}
1497
1498 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1499                          struct btrfs_device *device, u64 num_bytes,
1500                          u64 *start, u64 *len)
1501 {
1502         /* FIXME use last free of some kind */
1503         return find_free_dev_extent_start(trans->transaction, device,
1504                                           num_bytes, 0, start, len);
1505 }
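
/*
 * Illustrative usage, a sketch only: a caller that needs a hole of
 * @num_bytes on @device could do
 *
 *	u64 start, len;
 *
 *	ret = find_free_dev_extent(trans, device, num_bytes, &start, &len);
 *	if (ret == -ENOSPC)
 *		return ret;
 *
 * On success, [start, start + len) is the largest hole found, len is at
 * least num_bytes, and start is at least SZ_1M so the superblock and
 * boot loader area are never handed out.
 */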
1506
1507 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1508                           struct btrfs_device *device,
1509                           u64 start, u64 *dev_extent_len)
1510 {
1511         struct btrfs_fs_info *fs_info = device->fs_info;
1512         struct btrfs_root *root = fs_info->dev_root;
1513         int ret;
1514         struct btrfs_path *path;
1515         struct btrfs_key key;
1516         struct btrfs_key found_key;
1517         struct extent_buffer *leaf = NULL;
1518         struct btrfs_dev_extent *extent = NULL;
1519
1520         path = btrfs_alloc_path();
1521         if (!path)
1522                 return -ENOMEM;
1523
1524         key.objectid = device->devid;
1525         key.offset = start;
1526         key.type = BTRFS_DEV_EXTENT_KEY;
1527 again:
1528         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1529         if (ret > 0) {
1530                 ret = btrfs_previous_item(root, path, key.objectid,
1531                                           BTRFS_DEV_EXTENT_KEY);
1532                 if (ret)
1533                         goto out;
1534                 leaf = path->nodes[0];
1535                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1536                 extent = btrfs_item_ptr(leaf, path->slots[0],
1537                                         struct btrfs_dev_extent);
1538                 BUG_ON(found_key.offset > start || found_key.offset +
1539                        btrfs_dev_extent_length(leaf, extent) < start);
1540                 key = found_key;
1541                 btrfs_release_path(path);
1542                 goto again;
1543         } else if (ret == 0) {
1544                 leaf = path->nodes[0];
1545                 extent = btrfs_item_ptr(leaf, path->slots[0],
1546                                         struct btrfs_dev_extent);
1547         } else {
1548                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1549                 goto out;
1550         }
1551
1552         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1553
1554         ret = btrfs_del_item(trans, root, path);
1555         if (ret) {
1556                 btrfs_handle_fs_error(fs_info, ret,
1557                                       "Failed to remove dev extent item");
1558         } else {
1559                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1560         }
1561 out:
1562         btrfs_free_path(path);
1563         return ret;
1564 }
1565
1566 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1567                                   struct btrfs_device *device,
1568                                   u64 chunk_offset, u64 start, u64 num_bytes)
1569 {
1570         int ret;
1571         struct btrfs_path *path;
1572         struct btrfs_fs_info *fs_info = device->fs_info;
1573         struct btrfs_root *root = fs_info->dev_root;
1574         struct btrfs_dev_extent *extent;
1575         struct extent_buffer *leaf;
1576         struct btrfs_key key;
1577
1578         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1579         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1580         path = btrfs_alloc_path();
1581         if (!path)
1582                 return -ENOMEM;
1583
1584         key.objectid = device->devid;
1585         key.offset = start;
1586         key.type = BTRFS_DEV_EXTENT_KEY;
1587         ret = btrfs_insert_empty_item(trans, root, path, &key,
1588                                       sizeof(*extent));
1589         if (ret)
1590                 goto out;
1591
1592         leaf = path->nodes[0];
1593         extent = btrfs_item_ptr(leaf, path->slots[0],
1594                                 struct btrfs_dev_extent);
1595         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1596                                         BTRFS_CHUNK_TREE_OBJECTID);
1597         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1598                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1599         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1600
1601         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1602         btrfs_mark_buffer_dirty(leaf);
1603 out:
1604         btrfs_free_path(path);
1605         return ret;
1606 }
1607
1608 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1609 {
1610         struct extent_map_tree *em_tree;
1611         struct extent_map *em;
1612         struct rb_node *n;
1613         u64 ret = 0;
1614
1615         em_tree = &fs_info->mapping_tree.map_tree;
1616         read_lock(&em_tree->lock);
1617         n = rb_last(&em_tree->map);
1618         if (n) {
1619                 em = rb_entry(n, struct extent_map, rb_node);
1620                 ret = em->start + em->len;
1621         }
1622         read_unlock(&em_tree->lock);
1623
1624         return ret;
1625 }
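
/*
 * Note on the lookup above: the mapping tree is indexed by logical
 * address, so rb_last() yields the mapping with the highest start and
 * the next chunk can be placed at em->start + em->len. With an empty
 * tree this returns 0, i.e. the logical address space is handed out
 * bottom-up.
 */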
1626
1627 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1628                                     u64 *devid_ret)
1629 {
1630         int ret;
1631         struct btrfs_key key;
1632         struct btrfs_key found_key;
1633         struct btrfs_path *path;
1634
1635         path = btrfs_alloc_path();
1636         if (!path)
1637                 return -ENOMEM;
1638
1639         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1640         key.type = BTRFS_DEV_ITEM_KEY;
1641         key.offset = (u64)-1;
1642
1643         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1644         if (ret < 0)
1645                 goto error;
1646
1647         BUG_ON(ret == 0); /* Corruption */
1648
1649         ret = btrfs_previous_item(fs_info->chunk_root, path,
1650                                   BTRFS_DEV_ITEMS_OBJECTID,
1651                                   BTRFS_DEV_ITEM_KEY);
1652         if (ret) {
1653                 *devid_ret = 1;
1654         } else {
1655                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1656                                       path->slots[0]);
1657                 *devid_ret = found_key.offset + 1;
1658         }
1659         ret = 0;
1660 error:
1661         btrfs_free_path(path);
1662         return ret;
1663 }
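
/*
 * Worked example, for illustration: with dev items for devids 1 and 2
 * present, the (u64)-1 search lands past the last item,
 * btrfs_previous_item() steps back to the devid 2 item and *devid_ret
 * becomes 3. If no dev item exists at all, the previous-item lookup
 * fails and devid 1 is handed out.
 */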
1664
1665 /*
1666  * The device information is stored in the chunk root.
1667  * The btrfs_device struct should be fully filled in before calling this.
1668  */
1669 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1670                             struct btrfs_fs_info *fs_info,
1671                             struct btrfs_device *device)
1672 {
1673         struct btrfs_root *root = fs_info->chunk_root;
1674         int ret;
1675         struct btrfs_path *path;
1676         struct btrfs_dev_item *dev_item;
1677         struct extent_buffer *leaf;
1678         struct btrfs_key key;
1679         unsigned long ptr;
1680
1681         path = btrfs_alloc_path();
1682         if (!path)
1683                 return -ENOMEM;
1684
1685         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1686         key.type = BTRFS_DEV_ITEM_KEY;
1687         key.offset = device->devid;
1688
1689         ret = btrfs_insert_empty_item(trans, root, path, &key,
1690                                       sizeof(*dev_item));
1691         if (ret)
1692                 goto out;
1693
1694         leaf = path->nodes[0];
1695         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1696
1697         btrfs_set_device_id(leaf, dev_item, device->devid);
1698         btrfs_set_device_generation(leaf, dev_item, 0);
1699         btrfs_set_device_type(leaf, dev_item, device->type);
1700         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1701         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1702         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1703         btrfs_set_device_total_bytes(leaf, dev_item,
1704                                      btrfs_device_get_disk_total_bytes(device));
1705         btrfs_set_device_bytes_used(leaf, dev_item,
1706                                     btrfs_device_get_bytes_used(device));
1707         btrfs_set_device_group(leaf, dev_item, 0);
1708         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1709         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1710         btrfs_set_device_start_offset(leaf, dev_item, 0);
1711
1712         ptr = btrfs_device_uuid(dev_item);
1713         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1714         ptr = btrfs_device_fsid(dev_item);
1715         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1716         btrfs_mark_buffer_dirty(leaf);
1717
1718         ret = 0;
1719 out:
1720         btrfs_free_path(path);
1721         return ret;
1722 }
1723
1724 /*
1725  * Update ctime/mtime for a given device path.
1726  * Mainly used for ctime/mtime based probes such as libblkid.
1727  */
1728 static void update_dev_time(const char *path_name)
1729 {
1730         struct file *filp;
1731
1732         filp = filp_open(path_name, O_RDWR, 0);
1733         if (IS_ERR(filp))
1734                 return;
1735         file_update_time(filp);
1736         filp_close(filp, NULL);
1737 }
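
/*
 * This is deliberately best effort: if the path cannot be opened, the
 * timestamps simply stay stale. Callers such as btrfs_init_new_device()
 * invoke it once the device list has been committed, see the
 * "Update ctime/mtime for libblkid" call further below.
 */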
1738
1739 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1740                              struct btrfs_device *device)
1741 {
1742         struct btrfs_root *root = fs_info->chunk_root;
1743         int ret;
1744         struct btrfs_path *path;
1745         struct btrfs_key key;
1746         struct btrfs_trans_handle *trans;
1747
1748         path = btrfs_alloc_path();
1749         if (!path)
1750                 return -ENOMEM;
1751
1752         trans = btrfs_start_transaction(root, 0);
1753         if (IS_ERR(trans)) {
1754                 btrfs_free_path(path);
1755                 return PTR_ERR(trans);
1756         }
1757         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1758         key.type = BTRFS_DEV_ITEM_KEY;
1759         key.offset = device->devid;
1760
1761         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1762         if (ret) {
1763                 if (ret > 0)
1764                         ret = -ENOENT;
1765                 btrfs_abort_transaction(trans, ret);
1766                 btrfs_end_transaction(trans);
1767                 goto out;
1768         }
1769
1770         ret = btrfs_del_item(trans, root, path);
1771         if (ret) {
1772                 btrfs_abort_transaction(trans, ret);
1773                 btrfs_end_transaction(trans);
1774         }
1775
1776 out:
1777         btrfs_free_path(path);
1778         if (!ret)
1779                 ret = btrfs_commit_transaction(trans);
1780         return ret;
1781 }
1782
1783 /*
1784  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1785  * filesystem. It's up to the caller to adjust that number, e.g. for an
1786  * ongoing device replace.
1787  */
1788 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1789                 u64 num_devices)
1790 {
1791         u64 all_avail;
1792         unsigned seq;
1793         int i;
1794
1795         do {
1796                 seq = read_seqbegin(&fs_info->profiles_lock);
1797
1798                 all_avail = fs_info->avail_data_alloc_bits |
1799                             fs_info->avail_system_alloc_bits |
1800                             fs_info->avail_metadata_alloc_bits;
1801         } while (read_seqretry(&fs_info->profiles_lock, seq));
1802
1803         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1804                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1805                         continue;
1806
1807                 if (num_devices < btrfs_raid_array[i].devs_min) {
1808                         int ret = btrfs_raid_array[i].mindev_error;
1809
1810                         if (ret)
1811                                 return ret;
1812                 }
1813         }
1814
1815         return 0;
1816 }
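
/*
 * Worked example, a sketch: if any profile in use has devs_min == 4
 * (raid10, per btrfs_raid_array), calling this with num_devices == 3
 * returns that profile's mindev_error, which is how btrfs_rm_device()
 * refuses to drop the device count below the minimum.
 */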
1817
1818 static struct btrfs_device *btrfs_find_next_active_device(
1819                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1820 {
1821         struct btrfs_device *next_device;
1822
1823         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1824                 if (next_device != device &&
1825                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1826                     && next_device->bdev)
1827                         return next_device;
1828         }
1829
1830         return NULL;
1831 }
1832
1833 /*
1834  * Helper function to check if the given device is part of s_bdev / latest_bdev
1835  * and replace it with the provided or the next active device. In the context
1836  * where this function is called, there should always be another device (or
1837  * this_dev) which is active.
1838  */
1839 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1840                 struct btrfs_device *device, struct btrfs_device *this_dev)
1841 {
1842         struct btrfs_device *next_device;
1843
1844         if (this_dev)
1845                 next_device = this_dev;
1846         else
1847                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1848                                                                 device);
1849         ASSERT(next_device);
1850
1851         if (fs_info->sb->s_bdev &&
1852                         (fs_info->sb->s_bdev == device->bdev))
1853                 fs_info->sb->s_bdev = next_device->bdev;
1854
1855         if (fs_info->fs_devices->latest_bdev == device->bdev)
1856                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1857 }
1858
1859 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1860                 u64 devid)
1861 {
1862         struct btrfs_device *device;
1863         struct btrfs_fs_devices *cur_devices;
1864         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1865         u64 num_devices;
1866         int ret = 0;
1867
1868         mutex_lock(&uuid_mutex);
1869
1870         num_devices = fs_devices->num_devices;
1871         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1872         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1873                 WARN_ON(num_devices < 1);
1874                 num_devices--;
1875         }
1876         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1877
1878         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1879         if (ret)
1880                 goto out;
1881
1882         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1883                                            &device);
1884         if (ret)
1885                 goto out;
1886
1887         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1888                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1889                 goto out;
1890         }
1891
1892         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1893             fs_info->fs_devices->rw_devices == 1) {
1894                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1895                 goto out;
1896         }
1897
1898         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1899                 mutex_lock(&fs_info->chunk_mutex);
1900                 list_del_init(&device->dev_alloc_list);
1901                 device->fs_devices->rw_devices--;
1902                 mutex_unlock(&fs_info->chunk_mutex);
1903         }
1904
1905         mutex_unlock(&uuid_mutex);
1906         ret = btrfs_shrink_device(device, 0);
1907         mutex_lock(&uuid_mutex);
1908         if (ret)
1909                 goto error_undo;
1910
1911         /*
1912          * TODO: the superblock still includes this device in its num_devices
1913          * counter although write_all_supers() is not locked out. This
1914          * could give a filesystem state which requires a degraded mount.
1915          */
1916         ret = btrfs_rm_dev_item(fs_info, device);
1917         if (ret)
1918                 goto error_undo;
1919
1920         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1921         btrfs_scrub_cancel_dev(fs_info, device);
1922
1923         /*
1924          * The device list mutex makes sure that we don't change
1925          * the device list while someone else is writing out all
1926          * the device supers. Whoever is writing all supers should
1927          * lock the device list mutex before getting the number of
1928          * devices in the super block (super_copy). Conversely,
1929          * whoever updates the number of devices in the super block
1930          * (super_copy) should hold the device list mutex.
1931          */
1932
1933         /*
1934          * In normal cases cur_devices == fs_devices. But when
1935          * deleting a seed device, cur_devices should point to the
1936          * seed's own fs_devices, listed under fs_devices->seed.
1937          */
1938         cur_devices = device->fs_devices;
1939         mutex_lock(&fs_devices->device_list_mutex);
1940         list_del_rcu(&device->dev_list);
1941
1942         cur_devices->num_devices--;
1943         cur_devices->total_devices--;
1944         /* Update total_devices of the parent fs_devices if it's seed */
1945         if (cur_devices != fs_devices)
1946                 fs_devices->total_devices--;
1947
1948         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1949                 cur_devices->missing_devices--;
1950
1951         btrfs_assign_next_active_device(fs_info, device, NULL);
1952
1953         if (device->bdev) {
1954                 cur_devices->open_devices--;
1955                 /* remove sysfs entry */
1956                 btrfs_sysfs_rm_device_link(fs_devices, device);
1957         }
1958
1959         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1960         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1961         mutex_unlock(&fs_devices->device_list_mutex);
1962
1963         /*
1964          * at this point, the device is zero sized and detached from
1965          * the devices list.  All that's left is to zero out the old
1966          * supers and free the device.
1967          */
1968         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1969                 btrfs_scratch_superblocks(device->bdev, device->name->str);
1970
1971         btrfs_close_bdev(device);
1972         call_rcu(&device->rcu, free_device_rcu);
1973
1974         if (cur_devices->open_devices == 0) {
1975                 while (fs_devices) {
1976                         if (fs_devices->seed == cur_devices) {
1977                                 fs_devices->seed = cur_devices->seed;
1978                                 break;
1979                         }
1980                         fs_devices = fs_devices->seed;
1981                 }
1982                 cur_devices->seed = NULL;
1983                 close_fs_devices(cur_devices);
1984                 free_fs_devices(cur_devices);
1985         }
1986
1987 out:
1988         mutex_unlock(&uuid_mutex);
1989         return ret;
1990
1991 error_undo:
1992         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1993                 mutex_lock(&fs_info->chunk_mutex);
1994                 list_add(&device->dev_alloc_list,
1995                          &fs_devices->alloc_list);
1996                 device->fs_devices->rw_devices++;
1997                 mutex_unlock(&fs_info->chunk_mutex);
1998         }
1999         goto out;
2000 }
2001
2002 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2003                                         struct btrfs_device *srcdev)
2004 {
2005         struct btrfs_fs_devices *fs_devices;
2006
2007         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2008
2009         /*
2010          * In case of an fs with no seed, srcdev->fs_devices points to the
2011          * fs_devices of fs_info. However, when the dev being replaced is
2012          * a seed dev, it points to the seed's local fs_devices. In short,
2013          * srcdev has its correct fs_devices in both cases.
2014          */
2015         fs_devices = srcdev->fs_devices;
2016
2017         list_del_rcu(&srcdev->dev_list);
2018         list_del(&srcdev->dev_alloc_list);
2019         fs_devices->num_devices--;
2020         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2021                 fs_devices->missing_devices--;
2022
2023         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2024                 fs_devices->rw_devices--;
2025
2026         if (srcdev->bdev)
2027                 fs_devices->open_devices--;
2028 }
2029
2030 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2031                                       struct btrfs_device *srcdev)
2032 {
2033         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2034
2035         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2036                 /* zero out the old super if it is writable */
2037                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2038         }
2039
2040         btrfs_close_bdev(srcdev);
2041         call_rcu(&srcdev->rcu, free_device_rcu);
2042
2043         /* if there are no devs left we'd rather delete the fs_devices */
2044         if (!fs_devices->num_devices) {
2045                 struct btrfs_fs_devices *tmp_fs_devices;
2046
2047                 /*
2048                  * On a mounted FS, num_devices can't be zero unless it's a
2049                  * seed. In case of a seed device being replaced, the replace
2050                  * target is added to the sprout FS, so there will be no
2051                  * devices left under the seed FS.
2052                  */
2053                 ASSERT(fs_devices->seeding);
2054
2055                 tmp_fs_devices = fs_info->fs_devices;
2056                 while (tmp_fs_devices) {
2057                         if (tmp_fs_devices->seed == fs_devices) {
2058                                 tmp_fs_devices->seed = fs_devices->seed;
2059                                 break;
2060                         }
2061                         tmp_fs_devices = tmp_fs_devices->seed;
2062                 }
2063                 fs_devices->seed = NULL;
2064                 close_fs_devices(fs_devices);
2065                 free_fs_devices(fs_devices);
2066         }
2067 }
2068
2069 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2070                                       struct btrfs_device *tgtdev)
2071 {
2072         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2073
2074         WARN_ON(!tgtdev);
2075         mutex_lock(&fs_devices->device_list_mutex);
2076
2077         btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2078
2079         if (tgtdev->bdev)
2080                 fs_devices->open_devices--;
2081
2082         fs_devices->num_devices--;
2083
2084         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2085
2086         list_del_rcu(&tgtdev->dev_list);
2087
2088         mutex_unlock(&fs_devices->device_list_mutex);
2089
2090         /*
2091          * The update_dev_time() within btrfs_scratch_superblocks()
2092          * may lead to a call to btrfs_show_devname() which will try
2093          * to hold device_list_mutex. Since this device is already
2094          * out of the device list, we don't have to hold the
2095          * device_list_mutex lock here.
2096          */
2097         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2098
2099         btrfs_close_bdev(tgtdev);
2100         call_rcu(&tgtdev->rcu, free_device_rcu);
2101 }
2102
2103 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2104                                      const char *device_path,
2105                                      struct btrfs_device **device)
2106 {
2107         int ret = 0;
2108         struct btrfs_super_block *disk_super;
2109         u64 devid;
2110         u8 *dev_uuid;
2111         struct block_device *bdev;
2112         struct buffer_head *bh;
2113
2114         *device = NULL;
2115         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2116                                     fs_info->bdev_holder, 0, &bdev, &bh);
2117         if (ret)
2118                 return ret;
2119         disk_super = (struct btrfs_super_block *)bh->b_data;
2120         devid = btrfs_stack_device_id(&disk_super->dev_item);
2121         dev_uuid = disk_super->dev_item.uuid;
2122         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2123         brelse(bh);
2124         if (!*device)
2125                 ret = -ENOENT;
2126         blkdev_put(bdev, FMODE_READ);
2127         return ret;
2128 }
2129
2130 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2131                                          const char *device_path,
2132                                          struct btrfs_device **device)
2133 {
2134         *device = NULL;
2135         if (strcmp(device_path, "missing") == 0) {
2136                 struct list_head *devices;
2137                 struct btrfs_device *tmp;
2138
2139                 devices = &fs_info->fs_devices->devices;
2140                 list_for_each_entry(tmp, devices, dev_list) {
2141                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2142                                         &tmp->dev_state) && !tmp->bdev) {
2143                                 *device = tmp;
2144                                 break;
2145                         }
2146                 }
2147
2148                 if (!*device)
2149                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2150
2151                 return 0;
2152         } else {
2153                 return btrfs_find_device_by_path(fs_info, device_path, device);
2154         }
2155 }
2156
2157 /*
2158  * Look up a device by device id, or by path if the id is 0.
2159  */
2160 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2161                                  const char *devpath,
2162                                  struct btrfs_device **device)
2163 {
2164         int ret;
2165
2166         if (devid) {
2167                 ret = 0;
2168                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2169                 if (!*device)
2170                         ret = -ENOENT;
2171         } else {
2172                 if (!devpath || !devpath[0])
2173                         return -EINVAL;
2174
2175                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2176                                                            device);
2177         }
2178         return ret;
2179 }
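
/*
 * Resolution order, for illustration: a non-zero devid is looked up
 * directly and the path is ignored; devid == 0 falls back to the path,
 * where the special name "missing" picks the first device that is
 * present in the metadata but has no block device attached.
 */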
2180
2181 /*
2182  * Does all the dirty work required for changing the file system's UUID.
2183  */
2184 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2185 {
2186         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2187         struct btrfs_fs_devices *old_devices;
2188         struct btrfs_fs_devices *seed_devices;
2189         struct btrfs_super_block *disk_super = fs_info->super_copy;
2190         struct btrfs_device *device;
2191         u64 super_flags;
2192
2193         lockdep_assert_held(&uuid_mutex);
2194         if (!fs_devices->seeding)
2195                 return -EINVAL;
2196
2197         seed_devices = alloc_fs_devices(NULL);
2198         if (IS_ERR(seed_devices))
2199                 return PTR_ERR(seed_devices);
2200
2201         old_devices = clone_fs_devices(fs_devices);
2202         if (IS_ERR(old_devices)) {
2203                 kfree(seed_devices);
2204                 return PTR_ERR(old_devices);
2205         }
2206
2207         list_add(&old_devices->fs_list, &fs_uuids);
2208
2209         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2210         seed_devices->opened = 1;
2211         INIT_LIST_HEAD(&seed_devices->devices);
2212         INIT_LIST_HEAD(&seed_devices->alloc_list);
2213         mutex_init(&seed_devices->device_list_mutex);
2214
2215         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2216         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2217                               synchronize_rcu);
2218         list_for_each_entry(device, &seed_devices->devices, dev_list)
2219                 device->fs_devices = seed_devices;
2220
2221         mutex_lock(&fs_info->chunk_mutex);
2222         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2223         mutex_unlock(&fs_info->chunk_mutex);
2224
2225         fs_devices->seeding = 0;
2226         fs_devices->num_devices = 0;
2227         fs_devices->open_devices = 0;
2228         fs_devices->missing_devices = 0;
2229         fs_devices->rotating = 0;
2230         fs_devices->seed = seed_devices;
2231
2232         generate_random_uuid(fs_devices->fsid);
2233         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2234         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2235         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2236
2237         super_flags = btrfs_super_flags(disk_super) &
2238                       ~BTRFS_SUPER_FLAG_SEEDING;
2239         btrfs_set_super_flags(disk_super, super_flags);
2240
2241         return 0;
2242 }
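
/*
 * Sketch of the transition performed above: the mounted fs_devices
 * keeps its list heads but gets a freshly generated fsid, while the
 * new seed_devices copy inherits the original devices and is chained
 * in via fs_devices->seed. Clearing BTRFS_SUPER_FLAG_SEEDING in the
 * superblock marks the sprout as a regular, writable filesystem.
 */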
2243
2244 /*
2245  * Store the expected generation for seed devices in device items.
2246  */
2247 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2248                                struct btrfs_fs_info *fs_info)
2249 {
2250         struct btrfs_root *root = fs_info->chunk_root;
2251         struct btrfs_path *path;
2252         struct extent_buffer *leaf;
2253         struct btrfs_dev_item *dev_item;
2254         struct btrfs_device *device;
2255         struct btrfs_key key;
2256         u8 fs_uuid[BTRFS_FSID_SIZE];
2257         u8 dev_uuid[BTRFS_UUID_SIZE];
2258         u64 devid;
2259         int ret;
2260
2261         path = btrfs_alloc_path();
2262         if (!path)
2263                 return -ENOMEM;
2264
2265         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2266         key.offset = 0;
2267         key.type = BTRFS_DEV_ITEM_KEY;
2268
2269         while (1) {
2270                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2271                 if (ret < 0)
2272                         goto error;
2273
2274                 leaf = path->nodes[0];
2275 next_slot:
2276                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2277                         ret = btrfs_next_leaf(root, path);
2278                         if (ret > 0)
2279                                 break;
2280                         if (ret < 0)
2281                                 goto error;
2282                         leaf = path->nodes[0];
2283                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2284                         btrfs_release_path(path);
2285                         continue;
2286                 }
2287
2288                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2289                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2290                     key.type != BTRFS_DEV_ITEM_KEY)
2291                         break;
2292
2293                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2294                                           struct btrfs_dev_item);
2295                 devid = btrfs_device_id(leaf, dev_item);
2296                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2297                                    BTRFS_UUID_SIZE);
2298                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2299                                    BTRFS_FSID_SIZE);
2300                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2301                 BUG_ON(!device); /* Logic error */
2302
2303                 if (device->fs_devices->seeding) {
2304                         btrfs_set_device_generation(leaf, dev_item,
2305                                                     device->generation);
2306                         btrfs_mark_buffer_dirty(leaf);
2307                 }
2308
2309                 path->slots[0]++;
2310                 goto next_slot;
2311         }
2312         ret = 0;
2313 error:
2314         btrfs_free_path(path);
2315         return ret;
2316 }
2317
2318 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2319 {
2320         struct btrfs_root *root = fs_info->dev_root;
2321         struct request_queue *q;
2322         struct btrfs_trans_handle *trans;
2323         struct btrfs_device *device;
2324         struct block_device *bdev;
2325         struct super_block *sb = fs_info->sb;
2326         struct rcu_string *name;
2327         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2328         u64 tmp;
2329         int seeding_dev = 0;
2330         int ret = 0;
2331         bool unlocked = false;
2332
2333         if (sb_rdonly(sb) && !fs_devices->seeding)
2334                 return -EROFS;
2335
2336         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2337                                   fs_info->bdev_holder);
2338         if (IS_ERR(bdev))
2339                 return PTR_ERR(bdev);
2340
2341         if (fs_devices->seeding) {
2342                 seeding_dev = 1;
2343                 down_write(&sb->s_umount);
2344                 mutex_lock(&uuid_mutex);
2345         }
2346
2347         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2348
2349         mutex_lock(&fs_devices->device_list_mutex);
2350         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2351                 if (device->bdev == bdev) {
2352                         ret = -EEXIST;
2353                         mutex_unlock(&fs_devices->device_list_mutex);
2354                         goto error;
2356                 }
2357         }
2358         mutex_unlock(&fs_devices->device_list_mutex);
2359
2360         device = btrfs_alloc_device(fs_info, NULL, NULL);
2361         if (IS_ERR(device)) {
2362                 /* we can safely leave the fs_devices entry around */
2363                 ret = PTR_ERR(device);
2364                 goto error;
2365         }
2366
2367         name = rcu_string_strdup(device_path, GFP_KERNEL);
2368         if (!name) {
2369                 ret = -ENOMEM;
2370                 goto error_free_device;
2371         }
2372         rcu_assign_pointer(device->name, name);
2373
2374         trans = btrfs_start_transaction(root, 0);
2375         if (IS_ERR(trans)) {
2376                 ret = PTR_ERR(trans);
2377                 goto error_free_device;
2378         }
2379
2380         q = bdev_get_queue(bdev);
2381         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2382         device->generation = trans->transid;
2383         device->io_width = fs_info->sectorsize;
2384         device->io_align = fs_info->sectorsize;
2385         device->sector_size = fs_info->sectorsize;
2386         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2387                                          fs_info->sectorsize);
2388         device->disk_total_bytes = device->total_bytes;
2389         device->commit_total_bytes = device->total_bytes;
2390         device->fs_info = fs_info;
2391         device->bdev = bdev;
2392         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2393         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2394         device->mode = FMODE_EXCL;
2395         device->dev_stats_valid = 1;
2396         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2397
2398         if (seeding_dev) {
2399                 sb->s_flags &= ~SB_RDONLY;
2400                 ret = btrfs_prepare_sprout(fs_info);
2401                 if (ret) {
2402                         btrfs_abort_transaction(trans, ret);
2403                         goto error_trans;
2404                 }
2405         }
2406
2407         device->fs_devices = fs_devices;
2408
2409         mutex_lock(&fs_devices->device_list_mutex);
2410         mutex_lock(&fs_info->chunk_mutex);
2411         list_add_rcu(&device->dev_list, &fs_devices->devices);
2412         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2413         fs_devices->num_devices++;
2414         fs_devices->open_devices++;
2415         fs_devices->rw_devices++;
2416         fs_devices->total_devices++;
2417         fs_devices->total_rw_bytes += device->total_bytes;
2418
2419         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2420
2421         if (!blk_queue_nonrot(q))
2422                 fs_devices->rotating = 1;
2423
2424         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2425         btrfs_set_super_total_bytes(fs_info->super_copy,
2426                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2427
2428         tmp = btrfs_super_num_devices(fs_info->super_copy);
2429         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2430
2431         /* add sysfs device entry */
2432         btrfs_sysfs_add_device_link(fs_devices, device);
2433
2434         /*
2435          * we've got more storage, clear any full flags on the space
2436          * infos
2437          */
2438         btrfs_clear_space_info_full(fs_info);
2439
2440         mutex_unlock(&fs_info->chunk_mutex);
2441         mutex_unlock(&fs_devices->device_list_mutex);
2442
2443         if (seeding_dev) {
2444                 mutex_lock(&fs_info->chunk_mutex);
2445                 ret = init_first_rw_device(trans, fs_info);
2446                 mutex_unlock(&fs_info->chunk_mutex);
2447                 if (ret) {
2448                         btrfs_abort_transaction(trans, ret);
2449                         goto error_sysfs;
2450                 }
2451         }
2452
2453         ret = btrfs_add_dev_item(trans, fs_info, device);
2454         if (ret) {
2455                 btrfs_abort_transaction(trans, ret);
2456                 goto error_sysfs;
2457         }
2458
2459         if (seeding_dev) {
2460                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2461
2462                 ret = btrfs_finish_sprout(trans, fs_info);
2463                 if (ret) {
2464                         btrfs_abort_transaction(trans, ret);
2465                         goto error_sysfs;
2466                 }
2467
2468                 /* Sprouting would change the fsid of the mounted root,
2469                  * so rename the fsid in sysfs.
2470                  */
2471                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2472                                                 fs_info->fsid);
2473                 if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2474                         btrfs_warn(fs_info,
2475                                    "sysfs: failed to create fsid for sprout");
2476         }
2477
2478         ret = btrfs_commit_transaction(trans);
2479
2480         if (seeding_dev) {
2481                 mutex_unlock(&uuid_mutex);
2482                 up_write(&sb->s_umount);
2483                 unlocked = true;
2484
2485                 if (ret) /* transaction commit */
2486                         return ret;
2487
2488                 ret = btrfs_relocate_sys_chunks(fs_info);
2489                 if (ret < 0)
2490                         btrfs_handle_fs_error(fs_info, ret,
2491                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2492                 trans = btrfs_attach_transaction(root);
2493                 if (IS_ERR(trans)) {
2494                         if (PTR_ERR(trans) == -ENOENT)
2495                                 return 0;
2496                         ret = PTR_ERR(trans);
2497                         trans = NULL;
2498                         goto error_sysfs;
2499                 }
2500                 ret = btrfs_commit_transaction(trans);
2501         }
2502
2503         /* Update ctime/mtime for libblkid */
2504         update_dev_time(device_path);
2505         return ret;
2506
2507 error_sysfs:
2508         btrfs_sysfs_rm_device_link(fs_devices, device);
2509 error_trans:
2510         if (seeding_dev)
2511                 sb->s_flags |= SB_RDONLY;
2512         if (trans)
2513                 btrfs_end_transaction(trans);
2514 error_free_device:
2515         btrfs_free_device(device);
2516 error:
2517         blkdev_put(bdev, FMODE_EXCL);
2518         if (seeding_dev && !unlocked) {
2519                 mutex_unlock(&uuid_mutex);
2520                 up_write(&sb->s_umount);
2521         }
2522         return ret;
2523 }
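
/*
 * Illustrative call, assuming an ioctl-style caller: adding a disk to
 * a mounted filesystem boils down to
 *
 *	ret = btrfs_init_new_device(fs_info, "/dev/sdX");
 *
 * where "/dev/sdX" is a placeholder path. For a seeding source the
 * call also performs the sprout: the sb is flipped read-write,
 * btrfs_prepare_sprout() swaps in a new fsid and the system chunks
 * are relocated afterwards.
 */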
2524
2525 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2526                                         struct btrfs_device *device)
2527 {
2528         int ret;
2529         struct btrfs_path *path;
2530         struct btrfs_root *root = device->fs_info->chunk_root;
2531         struct btrfs_dev_item *dev_item;
2532         struct extent_buffer *leaf;
2533         struct btrfs_key key;
2534
2535         path = btrfs_alloc_path();
2536         if (!path)
2537                 return -ENOMEM;
2538
2539         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2540         key.type = BTRFS_DEV_ITEM_KEY;
2541         key.offset = device->devid;
2542
2543         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2544         if (ret < 0)
2545                 goto out;
2546
2547         if (ret > 0) {
2548                 ret = -ENOENT;
2549                 goto out;
2550         }
2551
2552         leaf = path->nodes[0];
2553         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2554
2555         btrfs_set_device_id(leaf, dev_item, device->devid);
2556         btrfs_set_device_type(leaf, dev_item, device->type);
2557         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2558         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2559         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2560         btrfs_set_device_total_bytes(leaf, dev_item,
2561                                      btrfs_device_get_disk_total_bytes(device));
2562         btrfs_set_device_bytes_used(leaf, dev_item,
2563                                     btrfs_device_get_bytes_used(device));
2564         btrfs_mark_buffer_dirty(leaf);
2565
2566 out:
2567         btrfs_free_path(path);
2568         return ret;
2569 }
2570
2571 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2572                       struct btrfs_device *device, u64 new_size)
2573 {
2574         struct btrfs_fs_info *fs_info = device->fs_info;
2575         struct btrfs_super_block *super_copy = fs_info->super_copy;
2576         struct btrfs_fs_devices *fs_devices;
2577         u64 old_total;
2578         u64 diff;
2579
2580         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2581                 return -EACCES;
2582
2583         new_size = round_down(new_size, fs_info->sectorsize);
2584
2585         mutex_lock(&fs_info->chunk_mutex);
2586         old_total = btrfs_super_total_bytes(super_copy);
2587         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2588
2589         if (new_size <= device->total_bytes ||
2590             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2591                 mutex_unlock(&fs_info->chunk_mutex);
2592                 return -EINVAL;
2593         }
2594
2595         fs_devices = fs_info->fs_devices;
2596
2597         btrfs_set_super_total_bytes(super_copy,
2598                         round_down(old_total + diff, fs_info->sectorsize));
2599         device->fs_devices->total_rw_bytes += diff;
2600
2601         btrfs_device_set_total_bytes(device, new_size);
2602         btrfs_device_set_disk_total_bytes(device, new_size);
2603         btrfs_clear_space_info_full(device->fs_info);
2604         if (list_empty(&device->resized_list))
2605                 list_add_tail(&device->resized_list,
2606                               &fs_devices->resized_devices);
2607         mutex_unlock(&fs_info->chunk_mutex);
2608
2609         return btrfs_update_device(trans, device);
2610 }
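
/*
 * Example, a sketch: growing a device by 1GiB would be
 *
 *	ret = btrfs_grow_device(trans, device, device->total_bytes + SZ_1G);
 *
 * new_size is rounded down to the sector size and must be strictly
 * larger than the current total_bytes; shrinking is handled separately
 * by btrfs_shrink_device().
 */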
2611
2612 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2613                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2614 {
2615         struct btrfs_root *root = fs_info->chunk_root;
2616         int ret;
2617         struct btrfs_path *path;
2618         struct btrfs_key key;
2619
2620         path = btrfs_alloc_path();
2621         if (!path)
2622                 return -ENOMEM;
2623
2624         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2625         key.offset = chunk_offset;
2626         key.type = BTRFS_CHUNK_ITEM_KEY;
2627
2628         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2629         if (ret < 0)
2630                 goto out;
2631         else if (ret > 0) { /* Logic error or corruption */
2632                 btrfs_handle_fs_error(fs_info, -ENOENT,
2633                                       "Failed lookup while freeing chunk.");
2634                 ret = -ENOENT;
2635                 goto out;
2636         }
2637
2638         ret = btrfs_del_item(trans, root, path);
2639         if (ret < 0)
2640                 btrfs_handle_fs_error(fs_info, ret,
2641                                       "Failed to delete chunk item.");
2642 out:
2643         btrfs_free_path(path);
2644         return ret;
2645 }
2646
2647 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2648 {
2649         struct btrfs_super_block *super_copy = fs_info->super_copy;
2650         struct btrfs_disk_key *disk_key;
2651         struct btrfs_chunk *chunk;
2652         u8 *ptr;
2653         int ret = 0;
2654         u32 num_stripes;
2655         u32 array_size;
2656         u32 len = 0;
2657         u32 cur;
2658         struct btrfs_key key;
2659
2660         mutex_lock(&fs_info->chunk_mutex);
2661         array_size = btrfs_super_sys_array_size(super_copy);
2662
2663         ptr = super_copy->sys_chunk_array;
2664         cur = 0;
2665
2666         while (cur < array_size) {
2667                 disk_key = (struct btrfs_disk_key *)ptr;
2668                 btrfs_disk_key_to_cpu(&key, disk_key);
2669
2670                 len = sizeof(*disk_key);
2671
2672                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2673                         chunk = (struct btrfs_chunk *)(ptr + len);
2674                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2675                         len += btrfs_chunk_item_size(num_stripes);
2676                 } else {
2677                         ret = -EIO;
2678                         break;
2679                 }
2680                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2681                     key.offset == chunk_offset) {
2682                         memmove(ptr, ptr + len, array_size - (cur + len));
2683                         array_size -= len;
2684                         btrfs_set_super_sys_array_size(super_copy, array_size);
2685                 } else {
2686                         ptr += len;
2687                         cur += len;
2688                 }
2689         }
2690         mutex_unlock(&fs_info->chunk_mutex);
2691         return ret;
2692 }
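
/*
 * Layout note, for illustration: sys_chunk_array is a packed sequence
 * of (btrfs_disk_key, btrfs_chunk including its stripes) pairs, so
 * deleting an entry is a memmove() of everything behind it plus a
 * matching shrink of the sys_array_size stored in the superblock copy.
 */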
2693
2694 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2695                                         u64 logical, u64 length)
2696 {
2697         struct extent_map_tree *em_tree;
2698         struct extent_map *em;
2699
2700         em_tree = &fs_info->mapping_tree.map_tree;
2701         read_lock(&em_tree->lock);
2702         em = lookup_extent_mapping(em_tree, logical, length);
2703         read_unlock(&em_tree->lock);
2704
2705         if (!em) {
2706                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2707                            logical, length);
2708                 return ERR_PTR(-EINVAL);
2709         }
2710
2711         if (em->start > logical || em->start + em->len < logical) {
2712                 btrfs_crit(fs_info,
2713                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2714                            logical, length, em->start, em->start + em->len);
2715                 free_extent_map(em);
2716                 return ERR_PTR(-EINVAL);
2717         }
2718
2719         /* callers are responsible for dropping em's ref. */
2720         return em;
2721 }
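
/*
 * Typical use, an illustrative sketch mirroring btrfs_remove_chunk()
 * below:
 *
 *	em = get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);
 *
 * A length of 1 is enough because the lookup only has to overlap the
 * mapping; the reference returned must be dropped by the caller.
 */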
2722
2723 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2724                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2725 {
2726         struct extent_map *em;
2727         struct map_lookup *map;
2728         u64 dev_extent_len = 0;
2729         int i, ret = 0;
2730         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2731
2732         em = get_chunk_map(fs_info, chunk_offset, 1);
2733         if (IS_ERR(em)) {
2734                 /*
2735                  * This is a logic error, but we don't want to just rely on the
2736                  * user having built with ASSERT enabled, so if ASSERT doesn't
2737                  * do anything we still error out.
2738                  */
2739                 ASSERT(0);
2740                 return PTR_ERR(em);
2741         }
2742         map = em->map_lookup;
2743         mutex_lock(&fs_info->chunk_mutex);
2744         check_system_chunk(trans, map->type);
2745         mutex_unlock(&fs_info->chunk_mutex);
2746
2747         /*
2748          * Take the device list mutex to prevent races with the final phase of
2749          * a device replace operation that replaces the device object associated
2750          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2751          */
2752         mutex_lock(&fs_devices->device_list_mutex);
2753         for (i = 0; i < map->num_stripes; i++) {
2754                 struct btrfs_device *device = map->stripes[i].dev;
2755                 ret = btrfs_free_dev_extent(trans, device,
2756                                             map->stripes[i].physical,
2757                                             &dev_extent_len);
2758                 if (ret) {
2759                         mutex_unlock(&fs_devices->device_list_mutex);
2760                         btrfs_abort_transaction(trans, ret);
2761                         goto out;
2762                 }
2763
2764                 if (device->bytes_used > 0) {
2765                         mutex_lock(&fs_info->chunk_mutex);
2766                         btrfs_device_set_bytes_used(device,
2767                                         device->bytes_used - dev_extent_len);
2768                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2769                         btrfs_clear_space_info_full(fs_info);
2770                         mutex_unlock(&fs_info->chunk_mutex);
2771                 }
2772
2773                 if (map->stripes[i].dev) {
2774                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2775                         if (ret) {
2776                                 mutex_unlock(&fs_devices->device_list_mutex);
2777                                 btrfs_abort_transaction(trans, ret);
2778                                 goto out;
2779                         }
2780                 }
2781         }
2782         mutex_unlock(&fs_devices->device_list_mutex);
2783
2784         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2785         if (ret) {
2786                 btrfs_abort_transaction(trans, ret);
2787                 goto out;
2788         }
2789
2790         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2791
2792         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2793                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2794                 if (ret) {
2795                         btrfs_abort_transaction(trans, ret);
2796                         goto out;
2797                 }
2798         }
2799
2800         ret = btrfs_remove_block_group(trans, chunk_offset, em);
2801         if (ret) {
2802                 btrfs_abort_transaction(trans, ret);
2803                 goto out;
2804         }
2805
2806 out:
2807         /* once for us */
2808         free_extent_map(em);
2809         return ret;
2810 }
2811
2812 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2813 {
2814         struct btrfs_root *root = fs_info->chunk_root;
2815         struct btrfs_trans_handle *trans;
2816         int ret;
2817
2818         /*
2819          * Prevent races with automatic removal of unused block groups.
2820          * After we relocate and before we remove the chunk with offset
2821          * chunk_offset, automatic removal of the block group can kick in,
2822          * resulting in a failure when calling btrfs_remove_chunk() below.
2823          *
2824          * Make sure to acquire this mutex before doing a tree search (dev
2825          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2826          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2827          * we release the path used to search the chunk/dev tree and before
2828          * the current task acquires this mutex and calls us.
2829          */
2830         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
2831
2832         ret = btrfs_can_relocate(fs_info, chunk_offset);
2833         if (ret)
2834                 return -ENOSPC;
2835
2836         /* step one, relocate all the extents inside this chunk */
2837         btrfs_scrub_pause(fs_info);
2838         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2839         btrfs_scrub_continue(fs_info);
2840         if (ret)
2841                 return ret;
2842
2843         /*
2844          * We add the kobjects here (and after forcing data chunk creation)
2845          * since relocation is the only place we'll create chunks of a new
2846          * type at runtime.  The only place where we'll remove the last
2847          * chunk of a type is the call immediately below this one.  Even
2848          * so, we're protected against races with the cleaner thread since
2849          * we're covered by the delete_unused_bgs_mutex.
2850          */
2851         btrfs_add_raid_kobjects(fs_info);
2852
2853         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2854                                                      chunk_offset);
2855         if (IS_ERR(trans)) {
2856                 ret = PTR_ERR(trans);
2857                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2858                 return ret;
2859         }
2860
2861         /*
2862          * step two, delete the device extents and the
2863          * chunk tree entries
2864          */
2865         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2866         btrfs_end_transaction(trans);
2867         return ret;
2868 }
2869
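/*
 * Walk the chunk tree from the highest offset down and relocate every
 * SYSTEM chunk.  Chunks that failed with -ENOSPC are retried in one more
 * full pass; if any still fail after that, -ENOSPC is returned.
 */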
2870 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2871 {
2872         struct btrfs_root *chunk_root = fs_info->chunk_root;
2873         struct btrfs_path *path;
2874         struct extent_buffer *leaf;
2875         struct btrfs_chunk *chunk;
2876         struct btrfs_key key;
2877         struct btrfs_key found_key;
2878         u64 chunk_type;
2879         bool retried = false;
2880         int failed = 0;
2881         int ret;
2882
2883         path = btrfs_alloc_path();
2884         if (!path)
2885                 return -ENOMEM;
2886
2887 again:
2888         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2889         key.offset = (u64)-1;
2890         key.type = BTRFS_CHUNK_ITEM_KEY;
2891
2892         while (1) {
2893                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2894                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2895                 if (ret < 0) {
2896                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2897                         goto error;
2898                 }
2899                 BUG_ON(ret == 0); /* Corruption */
2900
2901                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2902                                           key.type);
2903                 if (ret)
2904                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2905                 if (ret < 0)
2906                         goto error;
2907                 if (ret > 0)
2908                         break;
2909
2910                 leaf = path->nodes[0];
2911                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2912
2913                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2914                                        struct btrfs_chunk);
2915                 chunk_type = btrfs_chunk_type(leaf, chunk);
2916                 btrfs_release_path(path);
2917
2918                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2919                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
2920                         if (ret == -ENOSPC)
2921                                 failed++;
2922                         else
2923                                 BUG_ON(ret);
2924                 }
2925                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2926
2927                 if (found_key.offset == 0)
2928                         break;
2929                 key.offset = found_key.offset - 1;
2930         }
2931         ret = 0;
2932         if (failed && !retried) {
2933                 failed = 0;
2934                 retried = true;
2935                 goto again;
2936         } else if (WARN_ON(failed && retried)) {
2937                 ret = -ENOSPC;
2938         }
2939 error:
2940         btrfs_free_path(path);
2941         return ret;
2942 }
2943
2944 /*
2945  * return 1 : allocated a new data chunk successfully,
2946  * return <0: error while allocating a data chunk,
2947  * return 0 : no need to allocate a data chunk.
2948  */
2949 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
2950                                       u64 chunk_offset)
2951 {
2952         struct btrfs_block_group_cache *cache;
2953         u64 bytes_used;
2954         u64 chunk_type;
2955
2956         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2957         ASSERT(cache);
2958         chunk_type = cache->flags;
2959         btrfs_put_block_group(cache);
2960
2961         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
2962                 spin_lock(&fs_info->data_sinfo->lock);
2963                 bytes_used = fs_info->data_sinfo->bytes_used;
2964                 spin_unlock(&fs_info->data_sinfo->lock);
2965
2966                 if (!bytes_used) {
2967                         struct btrfs_trans_handle *trans;
2968                         int ret;
2969
2970                         trans = btrfs_join_transaction(fs_info->tree_root);
2971                         if (IS_ERR(trans))
2972                                 return PTR_ERR(trans);
2973
2974                         ret = btrfs_force_chunk_alloc(trans,
2975                                                       BTRFS_BLOCK_GROUP_DATA);
2976                         btrfs_end_transaction(trans);
2977                         if (ret < 0)
2978                                 return ret;
2979
2980                         btrfs_add_raid_kobjects(fs_info);
2981
2982                         return 1;
2983                 }
2984         }
2985         return 0;
2986 }
2987
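/*
 * Write the balance item (BTRFS_BALANCE_OBJECTID with
 * BTRFS_TEMPORARY_ITEM_KEY) into the tree root, so the balance state
 * survives a crash or unmount and an interrupted balance can be resumed.
 */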
2988 static int insert_balance_item(struct btrfs_fs_info *fs_info,
2989                                struct btrfs_balance_control *bctl)
2990 {
2991         struct btrfs_root *root = fs_info->tree_root;
2992         struct btrfs_trans_handle *trans;
2993         struct btrfs_balance_item *item;
2994         struct btrfs_disk_balance_args disk_bargs;
2995         struct btrfs_path *path;
2996         struct extent_buffer *leaf;
2997         struct btrfs_key key;
2998         int ret, err;
2999
3000         path = btrfs_alloc_path();
3001         if (!path)
3002                 return -ENOMEM;
3003
3004         trans = btrfs_start_transaction(root, 0);
3005         if (IS_ERR(trans)) {
3006                 btrfs_free_path(path);
3007                 return PTR_ERR(trans);
3008         }
3009
3010         key.objectid = BTRFS_BALANCE_OBJECTID;
3011         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3012         key.offset = 0;
3013
3014         ret = btrfs_insert_empty_item(trans, root, path, &key,
3015                                       sizeof(*item));
3016         if (ret)
3017                 goto out;
3018
3019         leaf = path->nodes[0];
3020         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3021
3022         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3023
3024         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3025         btrfs_set_balance_data(leaf, item, &disk_bargs);
3026         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3027         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3028         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3029         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3030
3031         btrfs_set_balance_flags(leaf, item, bctl->flags);
3032
3033         btrfs_mark_buffer_dirty(leaf);
3034 out:
3035         btrfs_free_path(path);
3036         err = btrfs_commit_transaction(trans);
3037         if (err && !ret)
3038                 ret = err;
3039         return ret;
3040 }
3041
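/* Delete the balance item; returns -ENOENT if it was never written. */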
3042 static int del_balance_item(struct btrfs_fs_info *fs_info)
3043 {
3044         struct btrfs_root *root = fs_info->tree_root;
3045         struct btrfs_trans_handle *trans;
3046         struct btrfs_path *path;
3047         struct btrfs_key key;
3048         int ret, err;
3049
3050         path = btrfs_alloc_path();
3051         if (!path)
3052                 return -ENOMEM;
3053
3054         trans = btrfs_start_transaction(root, 0);
3055         if (IS_ERR(trans)) {
3056                 btrfs_free_path(path);
3057                 return PTR_ERR(trans);
3058         }
3059
3060         key.objectid = BTRFS_BALANCE_OBJECTID;
3061         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3062         key.offset = 0;
3063
3064         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3065         if (ret < 0)
3066                 goto out;
3067         if (ret > 0) {
3068                 ret = -ENOENT;
3069                 goto out;
3070         }
3071
3072         ret = btrfs_del_item(trans, root, path);
3073 out:
3074         btrfs_free_path(path);
3075         err = btrfs_commit_transaction(trans);
3076         if (err && !ret)
3077                 ret = err;
3078         return ret;
3079 }
3080
3081 /*
3082  * This is a heuristic used to reduce the number of chunks balanced on
3083  * resume after balance was interrupted.
3084  */
3085 static void update_balance_args(struct btrfs_balance_control *bctl)
3086 {
3087         /*
3088          * Turn on soft mode for chunk types that were being converted.
3089          */
3090         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3091                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3092         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3093                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3094         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3095                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
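        /*
         * E.g. (in btrfs-progs terms) a balance interrupted while running
         * with -dconvert=raid1 resumes as -dconvert=raid1,soft, so chunks
         * that were already converted to raid1 are not relocated again.
         */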
3096
3097         /*
3098          * Turn on the usage filter if it is not already used.  The idea is
3099          * that chunks that we have already balanced should be
3100          * reasonably full.  Don't do it for chunks that are being
3101          * converted - that will keep us from relocating unconverted
3102          * (albeit full) chunks.
3103          */
3104         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3105             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3106             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3107                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3108                 bctl->data.usage = 90;
3109         }
3110         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3111             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3112             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3113                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3114                 bctl->sys.usage = 90;
3115         }
3116         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3117             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3118             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3119                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3120                 bctl->meta.usage = 90;
3121         }
3122 }
3123
3124 /*
3125  * Clear the balance status in fs_info and delete the balance item from disk.
3126  */
3127 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3128 {
3129         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3130         int ret;
3131
3132         BUG_ON(!fs_info->balance_ctl);
3133
3134         spin_lock(&fs_info->balance_lock);
3135         fs_info->balance_ctl = NULL;
3136         spin_unlock(&fs_info->balance_lock);
3137
3138         kfree(bctl);
3139         ret = del_balance_item(fs_info);
3140         if (ret)
3141                 btrfs_handle_fs_error(fs_info, ret, NULL);
3142 }
3143
3144 /*
3145  * Balance filters.  Return 1 if chunk should be filtered out
3146  * (should not be balanced).
3147  */
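/*
 * Note: chunk_to_extended() maps the profile bits into the extended space,
 * where SINGLE (all profile bits zero on disk) gets a bit of its own, so
 * the filters below can match "single" chunks explicitly.
 */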
3148 static int chunk_profiles_filter(u64 chunk_type,
3149                                  struct btrfs_balance_args *bargs)
3150 {
3151         chunk_type = chunk_to_extended(chunk_type) &
3152                                 BTRFS_EXTENDED_PROFILE_MASK;
3153
3154         if (bargs->profiles & chunk_type)
3155                 return 0;
3156
3157         return 1;
3158 }
3159
3160 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3161                               struct btrfs_balance_args *bargs)
3162 {
3163         struct btrfs_block_group_cache *cache;
3164         u64 chunk_used;
3165         u64 user_thresh_min;
3166         u64 user_thresh_max;
3167         int ret = 1;
3168
3169         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3170         chunk_used = btrfs_block_group_used(&cache->item);
3171
3172         if (bargs->usage_min == 0)
3173                 user_thresh_min = 0;
3174         else
3175                 user_thresh_min = div_factor_fine(cache->key.offset,
3176                                         bargs->usage_min);
3177
3178         if (bargs->usage_max == 0)
3179                 user_thresh_max = 1;
3180         else if (bargs->usage_max > 100)
3181                 user_thresh_max = cache->key.offset;
3182         else
3183                 user_thresh_max = div_factor_fine(cache->key.offset,
3184                                         bargs->usage_max);
3185
3186         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3187                 ret = 0;
3188
3189         btrfs_put_block_group(cache);
3190         return ret;
3191 }
3192
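/*
 * Single-threshold variant of the above: filter the chunk out once its used
 * bytes reach the given percentage of the chunk size.  E.g. with usage=50 a
 * 1GiB chunk is balanced only while less than 512MiB of it is used
 * (div_factor_fine() computes size * percent / 100).
 */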
3193 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3194                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3195 {
3196         struct btrfs_block_group_cache *cache;
3197         u64 chunk_used, user_thresh;
3198         int ret = 1;
3199
3200         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3201         chunk_used = btrfs_block_group_used(&cache->item);
3202
3203         if (bargs->usage_min == 0)
3204                 user_thresh = 1;
3205         else if (bargs->usage > 100)
3206                 user_thresh = cache->key.offset;
3207         else
3208                 user_thresh = div_factor_fine(cache->key.offset,
3209                                               bargs->usage);
3210
3211         if (chunk_used < user_thresh)
3212                 ret = 0;
3213
3214         btrfs_put_block_group(cache);
3215         return ret;
3216 }
3217
3218 static int chunk_devid_filter(struct extent_buffer *leaf,
3219                               struct btrfs_chunk *chunk,
3220                               struct btrfs_balance_args *bargs)
3221 {
3222         struct btrfs_stripe *stripe;
3223         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3224         int i;
3225
3226         for (i = 0; i < num_stripes; i++) {
3227                 stripe = btrfs_stripe_nr(chunk, i);
3228                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3229                         return 0;
3230         }
3231
3232         return 1;
3233 }
3234
3235 /* [pstart, pend) */
3236 static int chunk_drange_filter(struct extent_buffer *leaf,
3237                                struct btrfs_chunk *chunk,
3238                                struct btrfs_balance_args *bargs)
3239 {
3240         struct btrfs_stripe *stripe;
3241         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3242         u64 stripe_offset;
3243         u64 stripe_length;
3244         int factor;
3245         int i;
3246
3247         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3248                 return 0;
3249
3250         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3251              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3252                 factor = num_stripes / 2;
3253         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3254                 factor = num_stripes - 1;
3255         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3256                 factor = num_stripes - 2;
3257         } else {
3258                 factor = num_stripes;
3259         }
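        /*
         * factor converts the chunk's logical length into the length of a
         * single stripe on one device: e.g. a RAID10 chunk with 4 stripes
         * stores 2 copies, so each stripe covers chunk_length / 2 bytes of
         * its device.
         */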
3260
3261         for (i = 0; i < num_stripes; i++) {
3262                 stripe = btrfs_stripe_nr(chunk, i);
3263                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3264                         continue;
3265
3266                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3267                 stripe_length = btrfs_chunk_length(leaf, chunk);
3268                 stripe_length = div_u64(stripe_length, factor);
3269
3270                 if (stripe_offset < bargs->pend &&
3271                     stripe_offset + stripe_length > bargs->pstart)
3272                         return 0;
3273         }
3274
3275         return 1;
3276 }
3277
3278 /* [vstart, vend) */
3279 static int chunk_vrange_filter(struct extent_buffer *leaf,
3280                                struct btrfs_chunk *chunk,
3281                                u64 chunk_offset,
3282                                struct btrfs_balance_args *bargs)
3283 {
3284         if (chunk_offset < bargs->vend &&
3285             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3286                 /* at least part of the chunk is inside this vrange */
3287                 return 0;
3288
3289         return 1;
3290 }
3291
3292 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3293                                struct btrfs_chunk *chunk,
3294                                struct btrfs_balance_args *bargs)
3295 {
3296         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3297
3298         if (bargs->stripes_min <= num_stripes
3299                         && num_stripes <= bargs->stripes_max)
3300                 return 0;
3301
3302         return 1;
3303 }
3304
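/*
 * In "soft" conversion mode, chunks that already use the conversion target
 * profile are filtered out, so only chunks still carrying the old profile
 * get relocated.
 */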
3305 static int chunk_soft_convert_filter(u64 chunk_type,
3306                                      struct btrfs_balance_args *bargs)
3307 {
3308         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3309                 return 0;
3310
3311         chunk_type = chunk_to_extended(chunk_type) &
3312                                 BTRFS_EXTENDED_PROFILE_MASK;
3313
3314         if (bargs->target == chunk_type)
3315                 return 1;
3316
3317         return 0;
3318 }
3319
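/*
 * Run the chunk through every filter enabled for its block group type.
 * Returns 1 if the chunk should be balanced, 0 if some filter rejected it.
 * The limit filters count down in bargs, which is why they must run last.
 */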
3320 static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3321                                 struct extent_buffer *leaf,
3322                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3323 {
3324         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3325         struct btrfs_balance_args *bargs = NULL;
3326         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3327
3328         /* type filter */
3329         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3330               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3331                 return 0;
3332         }
3333
3334         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3335                 bargs = &bctl->data;
3336         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3337                 bargs = &bctl->sys;
3338         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3339                 bargs = &bctl->meta;
3340
3341         /* profiles filter */
3342         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3343             chunk_profiles_filter(chunk_type, bargs)) {
3344                 return 0;
3345         }
3346
3347         /* usage filter */
3348         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3349             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3350                 return 0;
3351         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3352             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3353                 return 0;
3354         }
3355
3356         /* devid filter */
3357         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3358             chunk_devid_filter(leaf, chunk, bargs)) {
3359                 return 0;
3360         }
3361
3362         /* drange filter, makes sense only with devid filter */
3363         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3364             chunk_drange_filter(leaf, chunk, bargs)) {
3365                 return 0;
3366         }
3367
3368         /* vrange filter */
3369         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3370             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3371                 return 0;
3372         }
3373
3374         /* stripes filter */
3375         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3376             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3377                 return 0;
3378         }
3379
3380         /* soft profile changing mode */
3381         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3382             chunk_soft_convert_filter(chunk_type, bargs)) {
3383                 return 0;
3384         }
3385
3386         /*
3387          * limited by count, must be the last filter
3388          */
3389         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3390                 if (bargs->limit == 0)
3391                         return 0;
3392                 else
3393                         bargs->limit--;
3394         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3395                 /*
3396                  * Same logic as the 'limit' filter; the minimum cannot be
3397                  * determined here because we do not have the global information
3398                  * about the count of all chunks that satisfy the filters.
3399                  */
3400                 if (bargs->limit_max == 0)
3401                         return 0;
3402                 else
3403                         bargs->limit_max--;
3404         }
3405
3406         return 1;
3407 }
3408
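/*
 * The core balance loop.  After making some room on the devices it makes
 * two passes over the chunk tree, from the highest chunk offset down to 0:
 * a counting pass that only fills bctl->stat.expected, then a second pass
 * that relocates each chunk accepted by should_balance_chunk().
 */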
3409 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3410 {
3411         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3412         struct btrfs_root *chunk_root = fs_info->chunk_root;
3413         struct btrfs_root *dev_root = fs_info->dev_root;
3414         struct list_head *devices;
3415         struct btrfs_device *device;
3416         u64 old_size;
3417         u64 size_to_free;
3418         u64 chunk_type;
3419         struct btrfs_chunk *chunk;
3420         struct btrfs_path *path = NULL;
3421         struct btrfs_key key;
3422         struct btrfs_key found_key;
3423         struct btrfs_trans_handle *trans;
3424         struct extent_buffer *leaf;
3425         int slot;
3426         int ret;
3427         int enospc_errors = 0;
3428         bool counting = true;
3429         /* The single value limit and the min/max limits share the same bytes in the args (a union), so save them here */
3430         u64 limit_data = bctl->data.limit;
3431         u64 limit_meta = bctl->meta.limit;
3432         u64 limit_sys = bctl->sys.limit;
3433         u32 count_data = 0;
3434         u32 count_meta = 0;
3435         u32 count_sys = 0;
3436         int chunk_reserved = 0;
3437
3438         /* step one, make some room on all the devices */
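        /*
         * Shrink each nearly-full writable device by up to 1MiB (at most
         * 10% of its size) and grow it back: btrfs_shrink_device() has to
         * relocate any chunk sitting in that tail range, which leaves some
         * free space at the end of every device for the passes below.
         */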
3439         devices = &fs_info->fs_devices->devices;
3440         list_for_each_entry(device, devices, dev_list) {
3441                 old_size = btrfs_device_get_total_bytes(device);
3442                 size_to_free = div_factor(old_size, 1);
3443                 size_to_free = min_t(u64, size_to_free, SZ_1M);
3444                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3445                     btrfs_device_get_total_bytes(device) -
3446                     btrfs_device_get_bytes_used(device) > size_to_free ||
3447                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3448                         continue;
3449
3450                 ret = btrfs_shrink_device(device, old_size - size_to_free);
3451                 if (ret == -ENOSPC)
3452                         break;
3453                 if (ret) {
3454                         /* btrfs_shrink_device never returns ret > 0 */
3455                         WARN_ON(ret > 0);
3456                         goto error;
3457                 }
3458
3459                 trans = btrfs_start_transaction(dev_root, 0);
3460                 if (IS_ERR(trans)) {
3461                         ret = PTR_ERR(trans);
3462                         btrfs_info_in_rcu(fs_info,
3463                  "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3464                                           rcu_str_deref(device->name), ret,
3465                                           old_size, old_size - size_to_free);
3466                         goto error;
3467                 }
3468
3469                 ret = btrfs_grow_device(trans, device, old_size);
3470                 if (ret) {
3471                         btrfs_end_transaction(trans);
3472                         /* btrfs_grow_device never returns ret > 0 */
3473                         WARN_ON(ret > 0);
3474                         btrfs_info_in_rcu(fs_info,
3475                  "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3476                                           rcu_str_deref(device->name), ret,
3477                                           old_size, old_size - size_to_free);
3478                         goto error;
3479                 }
3480
3481                 btrfs_end_transaction(trans);
3482         }
3483
3484         /* step two, relocate all the chunks */
3485         path = btrfs_alloc_path();
3486         if (!path) {
3487                 ret = -ENOMEM;
3488                 goto error;
3489         }
3490
3491         /* zero out stat counters */
3492         spin_lock(&fs_info->balance_lock);
3493         memset(&bctl->stat, 0, sizeof(bctl->stat));
3494         spin_unlock(&fs_info->balance_lock);
3495 again:
3496         if (!counting) {
3497                 /*
3498                  * The single value limit and min/max limits use the same
3499                  * bytes in the args (a union), so restore the saved values
3500                  */
3501                 bctl->data.limit = limit_data;
3502                 bctl->meta.limit = limit_meta;
3503                 bctl->sys.limit = limit_sys;
3504         }
3505         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3506         key.offset = (u64)-1;
3507         key.type = BTRFS_CHUNK_ITEM_KEY;
3508
3509         while (1) {
3510                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3511                     atomic_read(&fs_info->balance_cancel_req)) {
3512                         ret = -ECANCELED;
3513                         goto error;
3514                 }
3515
3516                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3517                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3518                 if (ret < 0) {
3519                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3520                         goto error;
3521                 }
3522
3523                 /*
3524                  * this shouldn't happen, it means the last relocate
3525                  * failed
3526                  */
3527                 if (ret == 0)
3528                         BUG(); /* FIXME break ? */
3529
3530                 ret = btrfs_previous_item(chunk_root, path, 0,
3531                                           BTRFS_CHUNK_ITEM_KEY);
3532                 if (ret) {
3533                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3534                         ret = 0;
3535                         break;
3536                 }
3537
3538                 leaf = path->nodes[0];
3539                 slot = path->slots[0];
3540                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3541
3542                 if (found_key.objectid != key.objectid) {
3543                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3544                         break;
3545                 }
3546
3547                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3548                 chunk_type = btrfs_chunk_type(leaf, chunk);
3549
3550                 if (!counting) {
3551                         spin_lock(&fs_info->balance_lock);
3552                         bctl->stat.considered++;
3553                         spin_unlock(&fs_info->balance_lock);
3554                 }
3555
3556                 ret = should_balance_chunk(fs_info, leaf, chunk,
3557                                            found_key.offset);
3558
3559                 btrfs_release_path(path);
3560                 if (!ret) {
3561                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3562                         goto loop;
3563                 }
3564
3565                 if (counting) {
3566                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3567                         spin_lock(&fs_info->balance_lock);
3568                         bctl->stat.expected++;
3569                         spin_unlock(&fs_info->balance_lock);
3570
3571                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3572                                 count_data++;
3573                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3574                                 count_sys++;
3575                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3576                                 count_meta++;
3577
3578                         goto loop;
3579                 }
3580
3581                 /*
3582                  * Apply limit_min filter, no need to check if the LIMITS
3583                  * filter is used, limit_min is 0 by default
3584                  */
3585                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3586                                         count_data < bctl->data.limit_min)
3587                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3588                                         count_meta < bctl->meta.limit_min)
3589                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3590                                         count_sys < bctl->sys.limit_min)) {
3591                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3592                         goto loop;
3593                 }
3594
3595                 if (!chunk_reserved) {
3596                         /*
3597                          * We may be relocating the only data chunk we have,
3598                          * which could potentially end up losing the data
3599                          * raid profile, so let's allocate an empty one in
3600                          * advance.
3601                          */
3602                         ret = btrfs_may_alloc_data_chunk(fs_info,
3603                                                          found_key.offset);
3604                         if (ret < 0) {
3605                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3606                                 goto error;
3607                         } else if (ret == 1) {
3608                                 chunk_reserved = 1;
3609                         }
3610                 }
3611
3612                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3613                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3614                 if (ret && ret != -ENOSPC)
3615                         goto error;
3616                 if (ret == -ENOSPC) {
3617                         enospc_errors++;
3618                 } else {
3619                         spin_lock(&fs_info->balance_lock);
3620                         bctl->stat.completed++;
3621                         spin_unlock(&fs_info->balance_lock);
3622                 }
3623 loop:
3624                 if (found_key.offset == 0)
3625                         break;
3626                 key.offset = found_key.offset - 1;
3627         }
3628
3629         if (counting) {
3630                 btrfs_release_path(path);
3631                 counting = false;
3632                 goto again;
3633         }
3634 error:
3635         btrfs_free_path(path);
3636         if (enospc_errors) {
3637                 btrfs_info(fs_info, "%d enospc errors during balance",
3638                            enospc_errors);
3639                 if (!ret)
3640                         ret = -ENOSPC;
3641         }
3642
3643         return ret;
3644 }
3645
3646 /**
3647  * alloc_profile_is_valid - see if a given profile is valid and reduced
3648  * @flags: profile to validate
3649  * @extended: if true @flags is treated as an extended profile
3650  */
3651 static int alloc_profile_is_valid(u64 flags, int extended)
3652 {
3653         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3654                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3655
3656         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3657
3658         /* 1) check that all other bits are zeroed */
3659         if (flags & ~mask)
3660                 return 0;
3661
3662         /* 2) see if profile is reduced */
3663         if (flags == 0)
3664                 return !extended; /* "0" is valid for usual profiles */
3665
3666         /* true if exactly one bit set */
3667         return (flags & (flags - 1)) == 0;
3668 }
3669
3670 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3671 {
3672         /* cancel requested || normal exit path */
3673         return atomic_read(&fs_info->balance_cancel_req) ||
3674                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3675                  atomic_read(&fs_info->balance_cancel_req) == 0);
3676 }
3677
3678 /* Non-zero return value signifies that the convert target is invalid */
3679 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3680                 u64 allowed)
3681 {
3682         return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3683                 (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3684                  (bctl_arg->target & ~allowed)));
3685 }
3686
3687 /*
3688  * Should be called with the balance mutex held.
3689  */
3690 int btrfs_balance(struct btrfs_fs_info *fs_info,
3691                   struct btrfs_balance_control *bctl,
3692                   struct btrfs_ioctl_balance_args *bargs)
3693 {
3694         u64 meta_target, data_target;
3695         u64 allowed;
3696         int mixed = 0;
3697         int ret;
3698         u64 num_devices;
3699         unsigned seq;
3700
3701         if (btrfs_fs_closing(fs_info) ||
3702             atomic_read(&fs_info->balance_pause_req) ||
3703             atomic_read(&fs_info->balance_cancel_req)) {
3704                 ret = -EINVAL;
3705                 goto out;
3706         }
3707
3708         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3709         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3710                 mixed = 1;
3711
3712         /*
3713          * In case of mixed groups, both data and meta should be picked,
3714          * and identical options should be given for both of them.
3715          */
3716         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3717         if (mixed && (bctl->flags & allowed)) {
3718                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3719                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3720                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3721                         btrfs_err(fs_info,
3722           "balance: mixed groups data and metadata options must be the same");
3723                         ret = -EINVAL;
3724                         goto out;
3725                 }
3726         }
3727
3728         num_devices = fs_info->fs_devices->num_devices;
3729         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
3730         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3731                 BUG_ON(num_devices < 1);
3732                 num_devices--;
3733         }
3734         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3735         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3736         if (num_devices > 1)
3737                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3738         if (num_devices > 2)
3739                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3740         if (num_devices > 3)
3741                 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3742                             BTRFS_BLOCK_GROUP_RAID6);
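        /*
         * "allowed" now holds every profile the current device count can
         * host, e.g. with exactly two devices a convert to raid0 or raid1
         * is possible, but raid5, raid6 and raid10 are not.
         */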