be3fc701f38948e37e5d776ee93413e0d06e85ff
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/bio.h>
8 #include <linux/slab.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/iocontext.h>
12 #include <linux/capability.h>
13 #include <linux/ratelimit.h>
14 #include <linux/kthread.h>
15 #include <linux/raid/pq.h>
16 #include <linux/semaphore.h>
17 #include <linux/uuid.h>
18 #include <linux/list_sort.h>
19 #include <asm/div64.h>
20 #include "ctree.h"
21 #include "extent_map.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "volumes.h"
26 #include "raid56.h"
27 #include "async-thread.h"
28 #include "check-integrity.h"
29 #include "rcu-string.h"
30 #include "math.h"
31 #include "dev-replace.h"
32 #include "sysfs.h"
33
/*
 * Allocation constraints for each RAID profile, indexed by BTRFS_RAID_*.
 *
 *  sub_stripes:         stripes grouped as mirrors of each other
 *                       (2 only for RAID10)
 *  dev_stripes:         stripes placed on the same device (2 for DUP)
 *  devs_max:            max devices used by one chunk, 0 == unlimited
 *  devs_min:            minimum devices needed to allocate a chunk
 *  tolerated_failures:  device losses survivable without losing data
 *  devs_increment:      device count must change in multiples of this
 *  ncopies:             how many copies of the data exist on-disk
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,	/* both copies live on one device */
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};
99
/*
 * Map a BTRFS_RAID_* index to its on-disk BTRFS_BLOCK_GROUP_* profile bit.
 * SINGLE has no profile bit and maps to 0.
 */
const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};
109
/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,	/* devs_min == 1, nothing to go below */
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};
124
125 static int init_first_rw_device(struct btrfs_trans_handle *trans,
126                                 struct btrfs_fs_info *fs_info);
127 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
128 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
129 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
130 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
131 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
132                              enum btrfs_map_op op,
133                              u64 logical, u64 *length,
134                              struct btrfs_bio **bbio_ret,
135                              int mirror_num, int need_raid_map);
136
137 /*
138  * Device locking
139  * ==============
140  *
141  * There are several mutexes that protect manipulation of devices and low-level
142  * structures like chunks but not block groups, extents or files
143  *
144  * uuid_mutex (global lock)
145  * ------------------------
146  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
147  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
148  * device) or requested by the device= mount option
149  *
150  * the mutex can be very coarse and can cover long-running operations
151  *
152  * protects: updates to fs_devices counters like missing devices, rw devices,
 153  * seeding, structure cloning, opening/closing devices at mount/umount time
154  *
155  * global::fs_devs - add, remove, updates to the global list
156  *
157  * does not protect: manipulation of the fs_devices::devices list!
158  *
159  * btrfs_device::name - renames (write side), read is RCU
160  *
161  * fs_devices::device_list_mutex (per-fs, with RCU)
162  * ------------------------------------------------
163  * protects updates to fs_devices::devices, ie. adding and deleting
164  *
165  * simple list traversal with read-only actions can be done with RCU protection
166  *
167  * may be used to exclude some operations from running concurrently without any
168  * modifications to the list (see write_all_supers)
169  *
170  * volume_mutex
171  * ------------
172  * coarse lock owned by a mounted filesystem; used to exclude some operations
173  * that cannot run in parallel and affect the higher-level properties of the
174  * filesystem like: device add/deleting/resize/replace, or balance
175  *
176  * balance_mutex
177  * -------------
178  * protects balance structures (status, state) and context accessed from
179  * several places (internally, ioctl)
180  *
181  * chunk_mutex
182  * -----------
183  * protects chunks, adding or removing during allocation, trim or when a new
184  * device is added/removed
185  *
186  * cleaner_mutex
187  * -------------
188  * a big lock that is held by the cleaner thread and prevents running subvolume
189  * cleaning together with relocation or delayed iputs
190  *
191  *
192  * Lock nesting
193  * ============
194  *
195  * uuid_mutex
196  *   volume_mutex
197  *     device_list_mutex
198  *       chunk_mutex
199  *     balance_mutex
200  */
201
/* Global registry of all known filesystems, see uuid_mutex above. */
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

/* Accessor so other compilation units can walk the fs_uuids list. */
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}
208
209 /*
210  * alloc_fs_devices - allocate struct btrfs_fs_devices
211  * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
212  *
213  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
214  * The returned struct is not linked onto any lists and can be destroyed with
215  * kfree() right away.
216  */
217 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
218 {
219         struct btrfs_fs_devices *fs_devs;
220
221         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
222         if (!fs_devs)
223                 return ERR_PTR(-ENOMEM);
224
225         mutex_init(&fs_devs->device_list_mutex);
226
227         INIT_LIST_HEAD(&fs_devs->devices);
228         INIT_LIST_HEAD(&fs_devs->resized_devices);
229         INIT_LIST_HEAD(&fs_devs->alloc_list);
230         INIT_LIST_HEAD(&fs_devs->list);
231         if (fsid)
232                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
233
234         return fs_devs;
235 }
236
/*
 * Release a btrfs_device: its rcu-string name, the preallocated flush bio
 * and the structure itself.  The device must already be unlinked from any
 * lists.
 */
static void free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}
243
244 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
245 {
246         struct btrfs_device *device;
247         WARN_ON(fs_devices->opened);
248         while (!list_empty(&fs_devices->devices)) {
249                 device = list_entry(fs_devices->devices.next,
250                                     struct btrfs_device, dev_list);
251                 list_del(&device->dev_list);
252                 free_device(device);
253         }
254         kfree(fs_devices);
255 }
256
257 static void btrfs_kobject_uevent(struct block_device *bdev,
258                                  enum kobject_action action)
259 {
260         int ret;
261
262         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
263         if (ret)
264                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
265                         action,
266                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
267                         &disk_to_dev(bdev->bd_disk)->kobj);
268 }
269
270 void __exit btrfs_cleanup_fs_uuids(void)
271 {
272         struct btrfs_fs_devices *fs_devices;
273
274         while (!list_empty(&fs_uuids)) {
275                 fs_devices = list_entry(fs_uuids.next,
276                                         struct btrfs_fs_devices, list);
277                 list_del(&fs_devices->list);
278                 free_fs_devices(fs_devices);
279         }
280 }
281
/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	/*
	 * NOTE(review): direct reclaim is masked off for the readahead radix
	 * trees — presumably because insertions happen in contexts that must
	 * not block on reclaim; confirm against the reada code.
	 */
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}
319
320 /*
321  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
322  * return NULL.
323  *
324  * If devid and uuid are both specified, the match must be exact, otherwise
325  * only devid is used.
326  */
327 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
328                 u64 devid, const u8 *uuid)
329 {
330         struct list_head *head = &fs_devices->devices;
331         struct btrfs_device *dev;
332
333         list_for_each_entry(dev, head, dev_list) {
334                 if (dev->devid == devid &&
335                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
336                         return dev;
337                 }
338         }
339         return NULL;
340 }
341
342 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
343 {
344         struct btrfs_fs_devices *fs_devices;
345
346         list_for_each_entry(fs_devices, &fs_uuids, list) {
347                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
348                         return fs_devices;
349         }
350         return NULL;
351 }
352
/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * @flags/@holder are passed through to blkdev_get_by_path().  When @flush
 * is set, dirty pagecache of the device is written back before reading.
 * On success *bdev and *bh are set and 0 is returned; on any failure both
 * are NULLed, the bdev reference (if taken) is dropped and a negative
 * errno is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	/* drop stale cached pages so the super block read hits the media */
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
389
/*
 * Splice the bio chain @head..@tail back onto the front of @pending_bios,
 * preserving submission order.  Used when run_scheduled_bios() backs off
 * before draining its snapshot of the list.
 */
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		/* list was non-empty: chain our tail to the old head */
		tail->bi_next = old_head;
	else
		/* list was empty: our tail becomes the list tail */
		pending_bios->tail = tail;
}
403
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the schedulers ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;	/* forces a turn of the regular list */
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	/* alternate between the sync and regular lists so neither starves */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			/* put our snapshot back and switch lists */
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			/* hand the remainder back and requeue ourselves */
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	/* final recheck under the lock so no late arrivals are missed */
	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}
587
588 static void pending_bios_fn(struct btrfs_work *work)
589 {
590         struct btrfs_device *device;
591
592         device = container_of(work, struct btrfs_device, work);
593         run_scheduled_bios(device);
594 }
595
/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {

		/* a mounted filesystem is never stale */
		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			/* a path filter can never match a nameless device */
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				/* last device: drop the whole fs_devices */
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				free_device(dev);
			}
		}
	}
}
646
/*
 * Open one registered device of @fs_devices and validate its on-disk super
 * block against the cached devid/uuid.  On success the device is marked
 * open (bdev/mode set, WRITEABLE state derived from the super block and
 * bdev) and the fs_devices open/rw counters are updated.
 *
 * Returns 0 on success, the error from btrfs_get_bdev_and_sb() if the
 * device could not be opened, or -EINVAL if the super block does not
 * belong to this device.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	/* already open, or a missing device with no path to open */
	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	/* the super block must match the device we think we opened */
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		/* seed devices are read-only by definition */
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	/* the replace target is not allocatable, keep it off alloc_list */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}
712
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	/* first device of a previously unseen filesystem? */
	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		/* never grow a mounted filesystem through scanning */
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		/* drop any stale registration that used the same path */
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		/* take over the path, releasing the previous name */
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}
838
839 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
840 {
841         struct btrfs_fs_devices *fs_devices;
842         struct btrfs_device *device;
843         struct btrfs_device *orig_dev;
844
845         fs_devices = alloc_fs_devices(orig->fsid);
846         if (IS_ERR(fs_devices))
847                 return fs_devices;
848
849         mutex_lock(&orig->device_list_mutex);
850         fs_devices->total_devices = orig->total_devices;
851
852         /* We have held the volume lock, it is safe to get the devices. */
853         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
854                 struct rcu_string *name;
855
856                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
857                                             orig_dev->uuid);
858                 if (IS_ERR(device))
859                         goto error;
860
861                 /*
862                  * This is ok to do without rcu read locked because we hold the
863                  * uuid mutex so nothing we touch in here is going to disappear.
864                  */
865                 if (orig_dev->name) {
866                         name = rcu_string_strdup(orig_dev->name->str,
867                                         GFP_KERNEL);
868                         if (!name) {
869                                 free_device(device);
870                                 goto error;
871                         }
872                         rcu_assign_pointer(device->name, name);
873                 }
874
875                 list_add(&device->dev_list, &fs_devices->devices);
876                 device->fs_devices = fs_devices;
877                 fs_devices->num_devices++;
878         }
879         mutex_unlock(&orig->device_list_mutex);
880         return fs_devices;
881 error:
882         mutex_unlock(&orig->device_list_mutex);
883         free_fs_devices(fs_devices);
884         return ERR_PTR(-ENOMEM);
885 }
886
887 /*
888  * After we have read the system tree and know devids belonging to
889  * this filesystem, remove the device which does not belong there.
890  */
891 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
892 {
893         struct btrfs_device *device, *next;
894         struct btrfs_device *latest_dev = NULL;
895
896         mutex_lock(&uuid_mutex);
897 again:
898         /* This is the initialized path, it is safe to release the devices. */
899         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
900                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
901                                                         &device->dev_state)) {
902                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
903                              &device->dev_state) &&
904                              (!latest_dev ||
905                               device->generation > latest_dev->generation)) {
906                                 latest_dev = device;
907                         }
908                         continue;
909                 }
910
911                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
912                         /*
913                          * In the first step, keep the device which has
914                          * the correct fsid and the devid that is used
915                          * for the dev_replace procedure.
916                          * In the second step, the dev_replace state is
917                          * read from the device tree and it is known
918                          * whether the procedure is really active or
919                          * not, which means whether this device is
920                          * used or whether it should be removed.
921                          */
922                         if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
923                                                   &device->dev_state)) {
924                                 continue;
925                         }
926                 }
927                 if (device->bdev) {
928                         blkdev_put(device->bdev, device->mode);
929                         device->bdev = NULL;
930                         fs_devices->open_devices--;
931                 }
932                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
933                         list_del_init(&device->dev_alloc_list);
934                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
935                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
936                                       &device->dev_state))
937                                 fs_devices->rw_devices--;
938                 }
939                 list_del_init(&device->dev_list);
940                 fs_devices->num_devices--;
941                 free_device(device);
942         }
943
944         if (fs_devices->seed) {
945                 fs_devices = fs_devices->seed;
946                 goto again;
947         }
948
949         fs_devices->latest_bdev = latest_dev->bdev;
950
951         mutex_unlock(&uuid_mutex);
952 }
953
954 static void free_device_rcu(struct rcu_head *head)
955 {
956         struct btrfs_device *device;
957
958         device = container_of(head, struct btrfs_device, rcu);
959         free_device(device);
960 }
961
962 static void btrfs_close_bdev(struct btrfs_device *device)
963 {
964         if (!device->bdev)
965                 return;
966
967         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
968                 sync_blockdev(device->bdev);
969                 invalidate_bdev(device->bdev);
970         }
971
972         blkdev_put(device->bdev, device->mode);
973 }
974
/*
 * Prepare @device for close by splicing a fresh placeholder into the
 * RCU-protected device list.
 *
 * Drops the fs_devices counters (open_devices, rw_devices,
 * missing_devices) that @device contributed to, allocates a new
 * btrfs_device with the same devid/uuid (and a copy of the name) and
 * swaps it into the list with list_replace_rcu(), so concurrent RCU
 * readers never see a freed entry.  The caller closes the bdev and frees
 * the old struct after a grace period.
 *
 * NOTE(review): allocation failures are fatal here (BUG_ON); presumably
 * because the close path cannot be unwound - confirm.
 */
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	/* The replace target is not on the alloc list / rw count. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	/* Swap the placeholder in; readers keep a consistent list. */
	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}
1007
/*
 * Drop one open reference on @fs_devices; when the last reference is gone,
 * close every device.
 *
 * Devices are first unhooked under device_list_mutex (a placeholder is
 * spliced into the RCU list, see btrfs_prepare_close_one_device()) and
 * collected on a local list; their bdevs are then closed outside the
 * mutex and the old structs freed after an RCU grace period.
 *
 * Always returns 0.
 */
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	/* Somebody else still holds it open - nothing to do yet. */
	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() is using the device_list_mutex,
	 * sometimes call to blkdev_put() leads vfs calling
	 * into this func. So do put outside of device_list_mutex,
	 * as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	/* All counters should have been dropped by the loop above. */
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}
1046
1047 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1048 {
1049         struct btrfs_fs_devices *seed_devices = NULL;
1050         int ret;
1051
1052         mutex_lock(&uuid_mutex);
1053         ret = __btrfs_close_devices(fs_devices);
1054         if (!fs_devices->opened) {
1055                 seed_devices = fs_devices->seed;
1056                 fs_devices->seed = NULL;
1057         }
1058         mutex_unlock(&uuid_mutex);
1059
1060         while (seed_devices) {
1061                 fs_devices = seed_devices;
1062                 seed_devices = fs_devices->seed;
1063                 __btrfs_close_devices(fs_devices);
1064                 free_fs_devices(fs_devices);
1065         }
1066         return ret;
1067 }
1068
1069 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1070                                 fmode_t flags, void *holder)
1071 {
1072         struct list_head *head = &fs_devices->devices;
1073         struct btrfs_device *device;
1074         struct btrfs_device *latest_dev = NULL;
1075         int ret = 0;
1076
1077         flags |= FMODE_EXCL;
1078
1079         list_for_each_entry(device, head, dev_list) {
1080                 /* Just open everything we can; ignore failures here */
1081                 if (btrfs_open_one_device(fs_devices, device, flags, holder))
1082                         continue;
1083
1084                 if (!latest_dev ||
1085                     device->generation > latest_dev->generation)
1086                         latest_dev = device;
1087         }
1088         if (fs_devices->open_devices == 0) {
1089                 ret = -EINVAL;
1090                 goto out;
1091         }
1092         fs_devices->opened = 1;
1093         fs_devices->latest_bdev = latest_dev->bdev;
1094         fs_devices->total_rw_bytes = 0;
1095 out:
1096         return ret;
1097 }
1098
1099 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1100 {
1101         struct btrfs_device *dev1, *dev2;
1102
1103         dev1 = list_entry(a, struct btrfs_device, dev_list);
1104         dev2 = list_entry(b, struct btrfs_device, dev_list);
1105
1106         if (dev1->devid < dev2->devid)
1107                 return -1;
1108         else if (dev1->devid > dev2->devid)
1109                 return 1;
1110         return 0;
1111 }
1112
1113 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1114                        fmode_t flags, void *holder)
1115 {
1116         int ret;
1117
1118         mutex_lock(&uuid_mutex);
1119         if (fs_devices->opened) {
1120                 fs_devices->opened++;
1121                 ret = 0;
1122         } else {
1123                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1124                 ret = __btrfs_open_devices(fs_devices, flags, holder);
1125         }
1126         mutex_unlock(&uuid_mutex);
1127         return ret;
1128 }
1129
/* Undo btrfs_read_disk_super(): unmap the superblock page and drop its ref. */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1135
/*
 * Read the superblock at byte offset @bytenr of @bdev through the page
 * cache (no set_blocksize allowed on this path).
 *
 * On success returns 0 with *page holding a kmap'ed page reference and
 * *disk_super pointing at the superblock inside it; the caller must pair
 * this with btrfs_release_disk_super().  Returns 1 on any failure: super
 * past end of device, super straddling a page, read error, or bad
 * magic/bytenr.  The label field is forcibly NUL-terminated.
 */
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	/* Sanity check: the super must record the bytenr we read it from. */
	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	/* Terminate the label if the on-disk copy used all its bytes. */
	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}
1180
1181 /*
1182  * Look for a btrfs signature on a device. This may be called out of the mount path
1183  * and we are not allowed to call set_blocksize during the scan. The superblock
1184  * is read via pagecache
1185  */
1186 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1187                           struct btrfs_fs_devices **fs_devices_ret)
1188 {
1189         struct btrfs_super_block *disk_super;
1190         struct btrfs_device *device;
1191         struct block_device *bdev;
1192         struct page *page;
1193         int ret = 0;
1194         u64 bytenr;
1195
1196         /*
1197          * we would like to check all the supers, but that would make
1198          * a btrfs mount succeed after a mkfs from a different FS.
1199          * So, we need to add a special mount option to scan for
1200          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1201          */
1202         bytenr = btrfs_sb_offset(0);
1203         flags |= FMODE_EXCL;
1204         mutex_lock(&uuid_mutex);
1205
1206         bdev = blkdev_get_by_path(path, flags, holder);
1207         if (IS_ERR(bdev)) {
1208                 ret = PTR_ERR(bdev);
1209                 goto error;
1210         }
1211
1212         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1213                 ret = -EINVAL;
1214                 goto error_bdev_put;
1215         }
1216
1217         device = device_list_add(path, disk_super);
1218         if (IS_ERR(device))
1219                 ret = PTR_ERR(device);
1220         else
1221                 *fs_devices_ret = device->fs_devices;
1222
1223         btrfs_release_disk_super(page);
1224
1225 error_bdev_put:
1226         blkdev_put(bdev, flags);
1227 error:
1228         mutex_unlock(&uuid_mutex);
1229         return ret;
1230 }
1231
/*
 * helper to account the used device space in the range [start, end]
 *
 * Walks the DEV_EXTENT items of @device in the device tree and sums into
 * *length the bytes of the inclusive range [start, end] covered by dev
 * extents.  *length is 0 (and 0 is returned) when @start is past the
 * device or the device is a replace target.  Returns a negative errno on
 * tree-search failure.
 */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* Step back: the previous extent may still overlap @start. */
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		/*
		 * Four overlap cases between the extent [key.offset,
		 * extent_end) and the query range [start, end]:
		 */
		if (key.offset <= start && extent_end > end) {
			/* extent covers the whole range - done */
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			/* extent overlaps the front of the range */
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			/* extent lies entirely inside the range */
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			/* extent overlaps the tail of the range - done */
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
1316
/*
 * Check whether any pending or pinned chunk stripe on @device overlaps
 * [*start, *start + len).
 *
 * Returns 1 and advances *start past the highest-ending overlapping
 * stripe (never moving it backwards) so the caller can retry its hole
 * search there; returns 0 when the range is clear.  When @transaction is
 * given, its pending chunks are checked first and then the fs-wide pinned
 * chunks; otherwise only the pinned list is checked.
 */
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			/* Skip stripes that don't intersect the query range. */
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	/* After the pending list (if any), also scan the pinned list. */
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}
1370
1371
1372 /*
1373  * find_free_dev_extent_start - find free space in the specified device
1374  * @device:       the device which we search the free space in
1375  * @num_bytes:    the size of the free space that we need
1376  * @search_start: the position from which to begin the search
1377  * @start:        store the start of the free space.
1378  * @len:          the size of the free space. that we find, or the size
1379  *                of the max free space if we don't find suitable free space
1380  *
1381  * this uses a pretty simple search, the expectation is that it is
1382  * called very infrequently and that a given device has a small number
1383  * of extents
1384  *
1385  * @start is used to store the start of the free space if we find. But if we
1386  * don't find suitable free space, it will be used to store the start position
1387  * of the max free space.
1388  *
1389  * @len is used to store the size of the free space that we find.
1390  * But if we don't find suitable free space, it is used to store the size of
1391  * the max free space.
1392  */
1393 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1394                                struct btrfs_device *device, u64 num_bytes,
1395                                u64 search_start, u64 *start, u64 *len)
1396 {
1397         struct btrfs_fs_info *fs_info = device->fs_info;
1398         struct btrfs_root *root = fs_info->dev_root;
1399         struct btrfs_key key;
1400         struct btrfs_dev_extent *dev_extent;
1401         struct btrfs_path *path;
1402         u64 hole_size;
1403         u64 max_hole_start;
1404         u64 max_hole_size;
1405         u64 extent_end;
1406         u64 search_end = device->total_bytes;
1407         int ret;
1408         int slot;
1409         struct extent_buffer *l;
1410
1411         /*
1412          * We don't want to overwrite the superblock on the drive nor any area
1413          * used by the boot loader (grub for example), so we make sure to start
1414          * at an offset of at least 1MB.
1415          */
1416         search_start = max_t(u64, search_start, SZ_1M);
1417
1418         path = btrfs_alloc_path();
1419         if (!path)
1420                 return -ENOMEM;
1421
1422         max_hole_start = search_start;
1423         max_hole_size = 0;
1424
1425 again:
1426         if (search_start >= search_end ||
1427                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1428                 ret = -ENOSPC;
1429                 goto out;
1430         }
1431
1432         path->reada = READA_FORWARD;
1433         path->search_commit_root = 1;
1434         path->skip_locking = 1;
1435
1436         key.objectid = device->devid;
1437         key.offset = search_start;
1438         key.type = BTRFS_DEV_EXTENT_KEY;
1439
1440         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1441         if (ret < 0)
1442                 goto out;
1443         if (ret > 0) {
1444                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1445                 if (ret < 0)
1446                         goto out;
1447         }
1448
1449         while (1) {
1450                 l = path->nodes[0];
1451                 slot = path->slots[0];
1452                 if (slot >= btrfs_header_nritems(l)) {
1453                         ret = btrfs_next_leaf(root, path);
1454                         if (ret == 0)
1455                                 continue;
1456                         if (ret < 0)
1457                                 goto out;
1458
1459                         break;
1460                 }
1461                 btrfs_item_key_to_cpu(l, &key, slot);
1462
1463                 if (key.objectid < device->devid)
1464                         goto next;
1465
1466                 if (key.objectid > device->devid)
1467                         break;
1468
1469                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1470                         goto next;
1471
1472                 if (key.offset > search_start) {
1473                         hole_size = key.offset - search_start;
1474
1475                         /*
1476                          * Have to check before we set max_hole_start, otherwise
1477                          * we could end up sending back this offset anyway.
1478                          */
1479                         if (contains_pending_extent(transaction, device,
1480                                                     &search_start,
1481                                                     hole_size)) {
1482                                 if (key.offset >= search_start) {
1483                                         hole_size = key.offset - search_start;
1484                                 } else {
1485                                         WARN_ON_ONCE(1);
1486                                         hole_size = 0;
1487                                 }
1488                         }
1489
1490                         if (hole_size > max_hole_size) {
1491                                 max_hole_start = search_start;
1492                                 max_hole_size = hole_size;
1493                         }
1494
1495                         /*
1496                          * If this free space is greater than which we need,
1497                          * it must be the max free space that we have found
1498                          * until now, so max_hole_start must point to the start
1499                          * of this free space and the length of this free space
1500                          * is stored in max_hole_size. Thus, we return
1501                          * max_hole_start and max_hole_size and go back to the
1502                          * caller.
1503                          */
1504                         if (hole_size >= num_bytes) {
1505                                 ret = 0;
1506                                 goto out;
1507                         }
1508                 }
1509
1510                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1511                 extent_end = key.offset + btrfs_dev_extent_length(l,
1512                                                                   dev_extent);
1513                 if (extent_end > search_start)
1514                         search_start = extent_end;
1515 next:
1516                 path->slots[0]++;
1517                 cond_resched();
1518         }
1519
1520         /*
1521          * At this point, search_start should be the end of
1522          * allocated dev extents, and when shrinking the device,
1523          * search_end may be smaller than search_start.
1524          */
1525         if (search_end > search_start) {
1526                 hole_size = search_end - search_start;
1527
1528                 if (contains_pending_extent(transaction, device, &search_start,
1529                                             hole_size)) {
1530                         btrfs_release_path(path);
1531                         goto again;
1532                 }
1533
1534                 if (hole_size > max_hole_size) {
1535                         max_hole_start = search_start;
1536                         max_hole_size = hole_size;
1537                 }
1538         }
1539
1540         /* See above. */
1541         if (max_hole_size < num_bytes)
1542                 ret = -ENOSPC;
1543         else
1544                 ret = 0;
1545
1546 out:
1547         btrfs_free_path(path);
1548         *start = max_hole_start;
1549         if (len)
1550                 *len = max_hole_size;
1551         return ret;
1552 }
1553
/*
 * Convenience wrapper around find_free_dev_extent_start() searching from
 * offset 0 against @trans's transaction.  See that function for the
 * meaning of @start and @len and the return values.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}
1562
/*
 * Delete the DEV_EXTENT item of @device that covers byte offset @start,
 * returning its length through @dev_extent_len.
 *
 * If no item starts exactly at @start, the previous DEV_EXTENT item is
 * located and must cover @start (BUG otherwise); the search is then redone
 * with that exact key so the item can be deleted.  Returns 0 on success,
 * 1 when no covering item exists, or a negative errno.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/* No exact match: the previous extent must cover @start. */
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		/* Re-search with the exact key so the item can be deleted. */
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}
1621
1622 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1623                                   struct btrfs_device *device,
1624                                   u64 chunk_offset, u64 start, u64 num_bytes)
1625 {
1626         int ret;
1627         struct btrfs_path *path;
1628         struct btrfs_fs_info *fs_info = device->fs_info;
1629         struct btrfs_root *root = fs_info->dev_root;
1630         struct btrfs_dev_extent *extent;
1631         struct extent_buffer *leaf;
1632         struct btrfs_key key;
1633
1634         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1635         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1636         path = btrfs_alloc_path();
1637         if (!path)
1638                 return -ENOMEM;
1639
1640         key.objectid = device->devid;
1641         key.offset = start;
1642         key.type = BTRFS_DEV_EXTENT_KEY;
1643         ret = btrfs_insert_empty_item(trans, root, path, &key,
1644                                       sizeof(*extent));
1645         if (ret)
1646                 goto out;
1647
1648         leaf = path->nodes[0];
1649         extent = btrfs_item_ptr(leaf, path->slots[0],
1650                                 struct btrfs_dev_extent);
1651         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1652                                         BTRFS_CHUNK_TREE_OBJECTID);
1653         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1654                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1655         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1656
1657         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1658         btrfs_mark_buffer_dirty(leaf);
1659 out:
1660         btrfs_free_path(path);
1661         return ret;
1662 }
1663
1664 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1665 {
1666         struct extent_map_tree *em_tree;
1667         struct extent_map *em;
1668         struct rb_node *n;
1669         u64 ret = 0;
1670
1671         em_tree = &fs_info->mapping_tree.map_tree;
1672         read_lock(&em_tree->lock);
1673         n = rb_last(&em_tree->map);
1674         if (n) {
1675                 em = rb_entry(n, struct extent_map, rb_node);
1676                 ret = em->start + em->len;
1677         }
1678         read_unlock(&em_tree->lock);
1679
1680         return ret;
1681 }
1682
1683 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1684                                     u64 *devid_ret)
1685 {
1686         int ret;
1687         struct btrfs_key key;
1688         struct btrfs_key found_key;
1689         struct btrfs_path *path;
1690
1691         path = btrfs_alloc_path();
1692         if (!path)
1693                 return -ENOMEM;
1694
1695         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1696         key.type = BTRFS_DEV_ITEM_KEY;
1697         key.offset = (u64)-1;
1698
1699         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1700         if (ret < 0)
1701                 goto error;
1702
1703         BUG_ON(ret == 0); /* Corruption */
1704
1705         ret = btrfs_previous_item(fs_info->chunk_root, path,
1706                                   BTRFS_DEV_ITEMS_OBJECTID,
1707                                   BTRFS_DEV_ITEM_KEY);
1708         if (ret) {
1709                 *devid_ret = 1;
1710         } else {
1711                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1712                                       path->slots[0]);
1713                 *devid_ret = found_key.offset + 1;
1714         }
1715         ret = 0;
1716 error:
1717         btrfs_free_path(path);
1718         return ret;
1719 }
1720
1721 /*
1722  * the device information is stored in the chunk root
1723  * the btrfs_device struct should be fully filled in
1724  */
1725 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1726                             struct btrfs_fs_info *fs_info,
1727                             struct btrfs_device *device)
1728 {
1729         struct btrfs_root *root = fs_info->chunk_root;
1730         int ret;
1731         struct btrfs_path *path;
1732         struct btrfs_dev_item *dev_item;
1733         struct extent_buffer *leaf;
1734         struct btrfs_key key;
1735         unsigned long ptr;
1736
1737         path = btrfs_alloc_path();
1738         if (!path)
1739                 return -ENOMEM;
1740
1741         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1742         key.type = BTRFS_DEV_ITEM_KEY;
1743         key.offset = device->devid;
1744
1745         ret = btrfs_insert_empty_item(trans, root, path, &key,
1746                                       sizeof(*dev_item));
1747         if (ret)
1748                 goto out;
1749
1750         leaf = path->nodes[0];
1751         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1752
1753         btrfs_set_device_id(leaf, dev_item, device->devid);
1754         btrfs_set_device_generation(leaf, dev_item, 0);
1755         btrfs_set_device_type(leaf, dev_item, device->type);
1756         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1757         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1758         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1759         btrfs_set_device_total_bytes(leaf, dev_item,
1760                                      btrfs_device_get_disk_total_bytes(device));
1761         btrfs_set_device_bytes_used(leaf, dev_item,
1762                                     btrfs_device_get_bytes_used(device));
1763         btrfs_set_device_group(leaf, dev_item, 0);
1764         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1765         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1766         btrfs_set_device_start_offset(leaf, dev_item, 0);
1767
1768         ptr = btrfs_device_uuid(dev_item);
1769         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1770         ptr = btrfs_device_fsid(dev_item);
1771         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1772         btrfs_mark_buffer_dirty(leaf);
1773
1774         ret = 0;
1775 out:
1776         btrfs_free_path(path);
1777         return ret;
1778 }
1779
1780 /*
1781  * Function to update ctime/mtime for a given device path.
1782  * Mainly used for ctime/mtime based probe like libblkid.
1783  */
1784 static void update_dev_time(const char *path_name)
1785 {
1786         struct file *filp;
1787
1788         filp = filp_open(path_name, O_RDWR, 0);
1789         if (IS_ERR(filp))
1790                 return;
1791         file_update_time(filp);
1792         filp_close(filp, NULL);
1793 }
1794
/*
 * Delete the DEV_ITEM of @device from the chunk tree.
 *
 * Runs in its own transaction: started here, aborted and ended on any
 * failure, committed only when both the search and the deletion succeed.
 * Returns 0 on success, -ENOENT when the item does not exist, or another
 * negative errno.
 */
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
                             struct btrfs_device *device)
{
        struct btrfs_root *root = fs_info->chunk_root;
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        /* Device items are keyed by devid under BTRFS_DEV_ITEMS_OBJECTID. */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret) {
                /* ret > 0 means the exact key was not found. */
                if (ret > 0)
                        ret = -ENOENT;
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
        }

out:
        btrfs_free_path(path);
        /*
         * Commit only on full success; every failure path above has already
         * ended the (aborted) transaction.
         */
        if (!ret)
                ret = btrfs_commit_transaction(trans);
        return ret;
}
1838
1839 /*
1840  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1841  * filesystem. It's up to the caller to adjust that number regarding eg. device
1842  * replace.
1843  */
1844 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1845                 u64 num_devices)
1846 {
1847         u64 all_avail;
1848         unsigned seq;
1849         int i;
1850
1851         do {
1852                 seq = read_seqbegin(&fs_info->profiles_lock);
1853
1854                 all_avail = fs_info->avail_data_alloc_bits |
1855                             fs_info->avail_system_alloc_bits |
1856                             fs_info->avail_metadata_alloc_bits;
1857         } while (read_seqretry(&fs_info->profiles_lock, seq));
1858
1859         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1860                 if (!(all_avail & btrfs_raid_group[i]))
1861                         continue;
1862
1863                 if (num_devices < btrfs_raid_array[i].devs_min) {
1864                         int ret = btrfs_raid_mindev_error[i];
1865
1866                         if (ret)
1867                                 return ret;
1868                 }
1869         }
1870
1871         return 0;
1872 }
1873
1874 static struct btrfs_device * btrfs_find_next_active_device(
1875                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1876 {
1877         struct btrfs_device *next_device;
1878
1879         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1880                 if (next_device != device &&
1881                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1882                     && next_device->bdev)
1883                         return next_device;
1884         }
1885
1886         return NULL;
1887 }
1888
1889 /*
1890  * Helper function to check if the given device is part of s_bdev / latest_bdev
1891  * and replace it with the provided or the next active device, in the context
1892  * where this function called, there should be always be another device (or
1893  * this_dev) which is active.
1894  */
1895 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1896                 struct btrfs_device *device, struct btrfs_device *this_dev)
1897 {
1898         struct btrfs_device *next_device;
1899
1900         if (this_dev)
1901                 next_device = this_dev;
1902         else
1903                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1904                                                                 device);
1905         ASSERT(next_device);
1906
1907         if (fs_info->sb->s_bdev &&
1908                         (fs_info->sb->s_bdev == device->bdev))
1909                 fs_info->sb->s_bdev = next_device->bdev;
1910
1911         if (fs_info->fs_devices->latest_bdev == device->bdev)
1912                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1913 }
1914
/*
 * Remove the device given by @devid, or by @device_path when devid is 0,
 * from a mounted filesystem.
 *
 * The device must not be the current replace target and its removal must
 * not drop the filesystem below the minimum device count of any in-use
 * RAID profile.  On success the device has been shrunk to zero, its
 * chunk-tree item deleted, it is unhooked from the in-memory lists and its
 * superblocks are wiped.  Returns 0, a negative errno, or a
 * BTRFS_ERROR_DEV_* code.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&uuid_mutex);

	/*
	 * An ongoing replace accounts for one extra device (the target);
	 * don't count it for the RAID constraint check below.
	 */
	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	/* Check the constraints against the count after this removal. */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	/* The replace target can only go away by canceling the replace. */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	/* Refuse to remove the last writeable device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	/* Stop new allocations from landing on this device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/* uuid_mutex is dropped for the duration of the shrink to size 0. */
	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		device->fs_devices->missing_devices--;

	/* Make sure s_bdev/latest_bdev no longer point at this device. */
	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	/*
	 * If that was the last open device of a (seed) fs_devices, unhook
	 * it from the seed chain and free it.
	 */
	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	mutex_unlock(&fs_info->volume_mutex);
	return ret;

error_undo:
	/* Put the device back on the allocation list. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
2052
/*
 * Unhook the replace source device @srcdev from its fs_devices lists and
 * adjust the device counters.  Caller must hold device_list_mutex.  The
 * device struct itself is freed later, in
 * btrfs_rm_dev_replace_free_srcdev().
 */
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	/* Only devices with an opened bdev count as open. */
	if (srcdev->bdev)
		fs_devices->open_devices--;
}
2080
/*
 * Free the replace source device @srcdev after it has been unhooked from
 * the device lists: wipe its superblocks when it was writable, close the
 * bdev and free the struct via RCU.  If this leaves its fs_devices with no
 * devices (seed case), the whole fs_devices is unchained and freed too.
 */
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		/* Unlink fs_devices from the ->seed chain before freeing. */
		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}
2119
/*
 * Tear down the replace target device @tgtdev: remove it from sysfs and
 * the device lists, pick a new active device for s_bdev/latest_bdev, wipe
 * its superblocks and free it via RCU.
 */
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	mutex_lock(&uuid_mutex);
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_info->fs_devices->open_devices--;

	fs_info->fs_devices->num_devices--;

	/* Make sure s_bdev/latest_bdev no longer point at the target. */
	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	/*
	 * The update_dev_time() with in btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}
2153
2154 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2155                                      const char *device_path,
2156                                      struct btrfs_device **device)
2157 {
2158         int ret = 0;
2159         struct btrfs_super_block *disk_super;
2160         u64 devid;
2161         u8 *dev_uuid;
2162         struct block_device *bdev;
2163         struct buffer_head *bh;
2164
2165         *device = NULL;
2166         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2167                                     fs_info->bdev_holder, 0, &bdev, &bh);
2168         if (ret)
2169                 return ret;
2170         disk_super = (struct btrfs_super_block *)bh->b_data;
2171         devid = btrfs_stack_device_id(&disk_super->dev_item);
2172         dev_uuid = disk_super->dev_item.uuid;
2173         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2174         brelse(bh);
2175         if (!*device)
2176                 ret = -ENOENT;
2177         blkdev_put(bdev, FMODE_READ);
2178         return ret;
2179 }
2180
2181 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2182                                          const char *device_path,
2183                                          struct btrfs_device **device)
2184 {
2185         *device = NULL;
2186         if (strcmp(device_path, "missing") == 0) {
2187                 struct list_head *devices;
2188                 struct btrfs_device *tmp;
2189
2190                 devices = &fs_info->fs_devices->devices;
2191                 /*
2192                  * It is safe to read the devices since the volume_mutex
2193                  * is held by the caller.
2194                  */
2195                 list_for_each_entry(tmp, devices, dev_list) {
2196                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2197                                         &tmp->dev_state) && !tmp->bdev) {
2198                                 *device = tmp;
2199                                 break;
2200                         }
2201                 }
2202
2203                 if (!*device)
2204                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2205
2206                 return 0;
2207         } else {
2208                 return btrfs_find_device_by_path(fs_info, device_path, device);
2209         }
2210 }
2211
2212 /*
2213  * Lookup a device given by device id, or the path if the id is 0.
2214  */
2215 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2216                                  const char *devpath,
2217                                  struct btrfs_device **device)
2218 {
2219         int ret;
2220
2221         if (devid) {
2222                 ret = 0;
2223                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2224                 if (!*device)
2225                         ret = -ENOENT;
2226         } else {
2227                 if (!devpath || !devpath[0])
2228                         return -EINVAL;
2229
2230                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2231                                                            device);
2232         }
2233         return ret;
2234 }
2235
2236 /*
2237  * does all the dirty work required for changing file system's UUID.
2238  */
2239 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2240 {
2241         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2242         struct btrfs_fs_devices *old_devices;
2243         struct btrfs_fs_devices *seed_devices;
2244         struct btrfs_super_block *disk_super = fs_info->super_copy;
2245         struct btrfs_device *device;
2246         u64 super_flags;
2247
2248         lockdep_assert_held(&uuid_mutex);
2249         if (!fs_devices->seeding)
2250                 return -EINVAL;
2251
2252         seed_devices = alloc_fs_devices(NULL);
2253         if (IS_ERR(seed_devices))
2254                 return PTR_ERR(seed_devices);
2255
2256         old_devices = clone_fs_devices(fs_devices);
2257         if (IS_ERR(old_devices)) {
2258                 kfree(seed_devices);
2259                 return PTR_ERR(old_devices);
2260         }
2261
2262         list_add(&old_devices->list, &fs_uuids);
2263
2264         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2265         seed_devices->opened = 1;
2266         INIT_LIST_HEAD(&seed_devices->devices);
2267         INIT_LIST_HEAD(&seed_devices->alloc_list);
2268         mutex_init(&seed_devices->device_list_mutex);
2269
2270         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2271         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2272                               synchronize_rcu);
2273         list_for_each_entry(device, &seed_devices->devices, dev_list)
2274                 device->fs_devices = seed_devices;
2275
2276         mutex_lock(&fs_info->chunk_mutex);
2277         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2278         mutex_unlock(&fs_info->chunk_mutex);
2279
2280         fs_devices->seeding = 0;
2281         fs_devices->num_devices = 0;
2282         fs_devices->open_devices = 0;
2283         fs_devices->missing_devices = 0;
2284         fs_devices->rotating = 0;
2285         fs_devices->seed = seed_devices;
2286
2287         generate_random_uuid(fs_devices->fsid);
2288         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2289         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2290         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2291
2292         super_flags = btrfs_super_flags(disk_super) &
2293                       ~BTRFS_SUPER_FLAG_SEEDING;
2294         btrfs_set_super_flags(disk_super, super_flags);
2295
2296         return 0;
2297 }
2298
2299 /*
2300  * Store the expected generation for seed devices in device items.
2301  */
2302 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2303                                struct btrfs_fs_info *fs_info)
2304 {
2305         struct btrfs_root *root = fs_info->chunk_root;
2306         struct btrfs_path *path;
2307         struct extent_buffer *leaf;
2308         struct btrfs_dev_item *dev_item;
2309         struct btrfs_device *device;
2310         struct btrfs_key key;
2311         u8 fs_uuid[BTRFS_FSID_SIZE];
2312         u8 dev_uuid[BTRFS_UUID_SIZE];
2313         u64 devid;
2314         int ret;
2315
2316         path = btrfs_alloc_path();
2317         if (!path)
2318                 return -ENOMEM;
2319
2320         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2321         key.offset = 0;
2322         key.type = BTRFS_DEV_ITEM_KEY;
2323
2324         while (1) {
2325                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2326                 if (ret < 0)
2327                         goto error;
2328
2329                 leaf = path->nodes[0];
2330 next_slot:
2331                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2332                         ret = btrfs_next_leaf(root, path);
2333                         if (ret > 0)
2334                                 break;
2335                         if (ret < 0)
2336                                 goto error;
2337                         leaf = path->nodes[0];
2338                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2339                         btrfs_release_path(path);
2340                         continue;
2341                 }
2342
2343                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2344                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2345                     key.type != BTRFS_DEV_ITEM_KEY)
2346                         break;
2347
2348                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2349                                           struct btrfs_dev_item);
2350                 devid = btrfs_device_id(leaf, dev_item);
2351                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2352                                    BTRFS_UUID_SIZE);
2353                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2354                                    BTRFS_FSID_SIZE);
2355                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2356                 BUG_ON(!device); /* Logic error */
2357
2358                 if (device->fs_devices->seeding) {
2359                         btrfs_set_device_generation(leaf, dev_item,
2360                                                     device->generation);
2361                         btrfs_mark_buffer_dirty(leaf);
2362                 }
2363
2364                 path->slots[0]++;
2365                 goto next_slot;
2366         }
2367         ret = 0;
2368 error:
2369         btrfs_free_path(path);
2370         return ret;
2371 }
2372
/*
 * Add the device at @device_path to the mounted filesystem.
 *
 * The device is opened exclusively, registered in the in-memory fs_devices
 * lists and persisted as a dev item in the chunk tree within a transaction.
 * When the mounted filesystem is a seed filesystem, a writable fs_devices
 * is first sprouted from it (btrfs_prepare_sprout/btrfs_finish_sprout) and,
 * after the transaction commits, the SYSTEM chunks are relocated onto the
 * new device.
 *
 * Returns 0 on success, negative errno on failure.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	u64 tmp;
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;

	/* A read-only mount may only gain a device by sprouting a seed fs. */
	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		/*
		 * Sprouting changes the fsid of the mounted fs; hold
		 * s_umount and uuid_mutex until the commit below.
		 */
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;

	/* Reject a device that is already part of this filesystem. */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	/* Usable size is the block device size rounded down to sectors. */
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* The sprouted filesystem becomes writable. */
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_info->fs_devices;

	/* Publish the device and update the global accounting. */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	fs_info->fs_devices->rw_devices++;
	fs_info->fs_devices->total_devices++;
	fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	/* One rotational device makes the whole fs count as rotational. */
	if (!blk_queue_nonrot(q))
		fs_info->fs_devices->rotating = 1;

	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(tmp + device->total_bytes, fs_info->sectorsize));

	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, fs_info, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/* Sprouting would change fsid of the mounted root,
		 * so rename the fsid on the sysfs
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		/* Move SYSTEM chunks off the (read-only) seed devices. */
		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	/* unlocked is true once the seeding locks were dropped above. */
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
2582
2583 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2584                                   const char *device_path,
2585                                   struct btrfs_device *srcdev,
2586                                   struct btrfs_device **device_out)
2587 {
2588         struct btrfs_device *device;
2589         struct block_device *bdev;
2590         struct list_head *devices;
2591         struct rcu_string *name;
2592         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2593         int ret = 0;
2594
2595         *device_out = NULL;
2596         if (fs_info->fs_devices->seeding) {
2597                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2598                 return -EINVAL;
2599         }
2600
2601         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2602                                   fs_info->bdev_holder);
2603         if (IS_ERR(bdev)) {
2604                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2605                 return PTR_ERR(bdev);
2606         }
2607
2608         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2609
2610         devices = &fs_info->fs_devices->devices;
2611         list_for_each_entry(device, devices, dev_list) {
2612                 if (device->bdev == bdev) {
2613                         btrfs_err(fs_info,
2614                                   "target device is in the filesystem!");
2615                         ret = -EEXIST;
2616                         goto error;
2617                 }
2618         }
2619
2620
2621         if (i_size_read(bdev->bd_inode) <
2622             btrfs_device_get_total_bytes(srcdev)) {
2623                 btrfs_err(fs_info,
2624                           "target device is smaller than source device!");
2625                 ret = -EINVAL;
2626                 goto error;
2627         }
2628
2629
2630         device = btrfs_alloc_device(NULL, &devid, NULL);
2631         if (IS_ERR(device)) {
2632                 ret = PTR_ERR(device);
2633                 goto error;
2634         }
2635
2636         name = rcu_string_strdup(device_path, GFP_KERNEL);
2637         if (!name) {
2638                 free_device(device);
2639                 ret = -ENOMEM;
2640                 goto error;
2641         }
2642         rcu_assign_pointer(device->name, name);
2643
2644         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2645         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2646         device->generation = 0;
2647         device->io_width = fs_info->sectorsize;
2648         device->io_align = fs_info->sectorsize;
2649         device->sector_size = fs_info->sectorsize;
2650         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2651         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2652         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2653         device->commit_total_bytes = srcdev->commit_total_bytes;
2654         device->commit_bytes_used = device->bytes_used;
2655         device->fs_info = fs_info;
2656         device->bdev = bdev;
2657         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2658         set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2659         device->mode = FMODE_EXCL;
2660         device->dev_stats_valid = 1;
2661         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2662         device->fs_devices = fs_info->fs_devices;
2663         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2664         fs_info->fs_devices->num_devices++;
2665         fs_info->fs_devices->open_devices++;
2666         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2667
2668         *device_out = device;
2669         return ret;
2670
2671 error:
2672         blkdev_put(bdev, FMODE_EXCL);
2673         return ret;
2674 }
2675
2676 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2677                                         struct btrfs_device *device)
2678 {
2679         int ret;
2680         struct btrfs_path *path;
2681         struct btrfs_root *root = device->fs_info->chunk_root;
2682         struct btrfs_dev_item *dev_item;
2683         struct extent_buffer *leaf;
2684         struct btrfs_key key;
2685
2686         path = btrfs_alloc_path();
2687         if (!path)
2688                 return -ENOMEM;
2689
2690         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2691         key.type = BTRFS_DEV_ITEM_KEY;
2692         key.offset = device->devid;
2693
2694         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2695         if (ret < 0)
2696                 goto out;
2697
2698         if (ret > 0) {
2699                 ret = -ENOENT;
2700                 goto out;
2701         }
2702
2703         leaf = path->nodes[0];
2704         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2705
2706         btrfs_set_device_id(leaf, dev_item, device->devid);
2707         btrfs_set_device_type(leaf, dev_item, device->type);
2708         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2709         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2710         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2711         btrfs_set_device_total_bytes(leaf, dev_item,
2712                                      btrfs_device_get_disk_total_bytes(device));
2713         btrfs_set_device_bytes_used(leaf, dev_item,
2714                                     btrfs_device_get_bytes_used(device));
2715         btrfs_mark_buffer_dirty(leaf);
2716
2717 out:
2718         btrfs_free_path(path);
2719         return ret;
2720 }
2721
/*
 * Grow @device to @new_size (rounded down to a sector multiple), updating
 * the in-memory counters and super block copy under chunk_mutex and then
 * persisting the new size via btrfs_update_device().
 *
 * Returns 0 on success, -EACCES if the device is not writable, -EINVAL if
 * @new_size does not actually grow the device or the device is a replace
 * target, or an error from btrfs_update_device().
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_fs_devices *fs_devices;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	/* Only growing is supported here; replace targets are never grown. */
	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	fs_devices = fs_info->fs_devices;

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	/*
	 * NOTE(review): device->fs_devices is used here while fs_devices
	 * (fs_info->fs_devices) is used for the resized list below; for a
	 * writable device these presumably refer to the same structure —
	 * confirm for seed/sprout configurations.
	 */
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	/* Queue the device so the resize is picked up at commit time. */
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}
2762
2763 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2764                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2765 {
2766         struct btrfs_root *root = fs_info->chunk_root;
2767         int ret;
2768         struct btrfs_path *path;
2769         struct btrfs_key key;
2770
2771         path = btrfs_alloc_path();
2772         if (!path)
2773                 return -ENOMEM;
2774
2775         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2776         key.offset = chunk_offset;
2777         key.type = BTRFS_CHUNK_ITEM_KEY;
2778
2779         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2780         if (ret < 0)
2781                 goto out;
2782         else if (ret > 0) { /* Logic error or corruption */
2783                 btrfs_handle_fs_error(fs_info, -ENOENT,
2784                                       "Failed lookup while freeing chunk.");
2785                 ret = -ENOENT;
2786                 goto out;
2787         }
2788
2789         ret = btrfs_del_item(trans, root, path);
2790         if (ret < 0)
2791                 btrfs_handle_fs_error(fs_info, ret,
2792                                       "Failed to delete chunk item.");
2793 out:
2794         btrfs_free_path(path);
2795         return ret;
2796 }
2797
/*
 * Remove the entry for @chunk_offset from the super block's in-memory
 * sys_chunk_array, shrinking the array in place.
 *
 * Returns 0 on success (including when the chunk is not present) or -EIO
 * if a non-chunk key is found in the array (corruption).
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	/* The array is a packed sequence of (disk key, chunk item) pairs. */
	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			/* Anything but a chunk key means a corrupted array. */
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			/*
			 * Shift the tail of the array down over this entry.
			 * ptr/cur are intentionally not advanced: after the
			 * memmove the next entry sits at the current offset.
			 */
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}
2844
2845 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2846                                         u64 logical, u64 length)
2847 {
2848         struct extent_map_tree *em_tree;
2849         struct extent_map *em;
2850
2851         em_tree = &fs_info->mapping_tree.map_tree;
2852         read_lock(&em_tree->lock);
2853         em = lookup_extent_mapping(em_tree, logical, length);
2854         read_unlock(&em_tree->lock);
2855
2856         if (!em) {
2857                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2858                            logical, length);
2859                 return ERR_PTR(-EINVAL);
2860         }
2861
2862         if (em->start > logical || em->start + em->len < logical) {
2863                 btrfs_crit(fs_info,
2864                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2865                            logical, length, em->start, em->start + em->len);
2866                 free_extent_map(em);
2867                 return ERR_PTR(-EINVAL);
2868         }
2869
2870         /* callers are responsible for dropping em's ref. */
2871         return em;
2872 }
2873
/*
 * Remove the chunk mapped at @chunk_offset: free the device extents backing
 * each stripe (updating the devices' accounting and dev items), delete the
 * chunk tree item, drop the super block SYSTEM array entry for system
 * chunks, and finally remove the block group.
 *
 * Any failure aborts the transaction.  The caller is expected to have
 * relocated/emptied the chunk beforehand.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/* Return the freed extent's bytes to the device accounting. */
		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			/* Persist the updated bytes_used in the dev item. */
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	/* System chunks are also recorded in the super's sys_chunk_array. */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}
2962
/*
 * Relocate all extents inside the chunk at @chunk_offset and then delete
 * the chunk itself (device extents, chunk item, block group) via
 * btrfs_remove_chunk().
 *
 * The caller must hold fs_info->delete_unused_bgs_mutex; see the comment
 * below for why.  Returns 0 on success, -ENOSPC if the chunk cannot be
 * relocated, or another negative errno.
 */
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
3020
/*
 * Walk the chunk tree from highest to lowest offset and relocate every
 * SYSTEM chunk.  Chunks that fail with -ENOSPC are retried in one more
 * full pass (earlier relocations may have freed space); if failures
 * persist after the retry, -ENOSPC is returned.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	/* Start past the last possible chunk and iterate backwards. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * Must be taken before searching the chunk tree; see the
		 * comment in btrfs_relocate_chunk().
		 */
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		/* Nonzero means no more chunk items; drop the mutex. */
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* Release the tree path before relocating the chunk. */
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
3094
3095 /*
3096  * return 1 : allocate a data chunk successfully,
3097  * return <0: errors during allocating a data chunk,
3098  * return 0 : no need to allocate a data chunk.
3099  */
3100 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3101                                       u64 chunk_offset)
3102 {
3103         struct btrfs_block_group_cache *cache;
3104         u64 bytes_used;
3105         u64 chunk_type;
3106
3107         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3108         ASSERT(cache);
3109         chunk_type = cache->flags;
3110         btrfs_put_block_group(cache);
3111
3112         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3113                 spin_lock(&fs_info->data_sinfo->lock);
3114                 bytes_used = fs_info->data_sinfo->bytes_used;
3115                 spin_unlock(&fs_info->data_sinfo->lock);
3116
3117                 if (!bytes_used) {
3118                         struct btrfs_trans_handle *trans;
3119                         int ret;
3120
3121                         trans = btrfs_join_transaction(fs_info->tree_root);
3122                         if (IS_ERR(trans))
3123                                 return PTR_ERR(trans);
3124
3125                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3126                                                       BTRFS_BLOCK_GROUP_DATA);
3127                         btrfs_end_transaction(trans);
3128                         if (ret < 0)
3129                                 return ret;
3130
3131                         btrfs_add_raid_kobjects(fs_info);
3132
3133                         return 1;
3134                 }
3135         }
3136         return 0;
3137 }
3138
3139 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3140                                struct btrfs_balance_control *bctl)
3141 {
3142         struct btrfs_root *root = fs_info->tree_root;
3143         struct btrfs_trans_handle *trans;
3144         struct btrfs_balance_item *item;
3145         struct btrfs_disk_balance_args disk_bargs;
3146         struct btrfs_path *path;
3147         struct extent_buffer *leaf;
3148         struct btrfs_key key;
3149         int ret, err;
3150
3151         path = btrfs_alloc_path();
3152         if (!path)
3153                 return -ENOMEM;
3154
3155         trans = btrfs_start_transaction(root, 0);
3156         if (IS_ERR(trans)) {
3157                 btrfs_free_path(path);
3158                 return PTR_ERR(trans);
3159         }
3160
3161         key.objectid = BTRFS_BALANCE_OBJECTID;
3162         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3163         key.offset = 0;
3164
3165         ret = btrfs_insert_empty_item(trans, root, path, &key,
3166                                       sizeof(*item));
3167         if (ret)
3168                 goto out;
3169
3170         leaf = path->nodes[0];
3171         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3172
3173         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3174
3175         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3176         btrfs_set_balance_data(leaf, item, &disk_bargs);
3177         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3178         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3179         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3180         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3181
3182         btrfs_set_balance_flags(leaf, item, bctl->flags);
3183
3184         btrfs_mark_buffer_dirty(leaf);
3185 out:
3186         btrfs_free_path(path);
3187         err = btrfs_commit_transaction(trans);
3188         if (err && !ret)
3189                 ret = err;
3190         return ret;
3191 }
3192
/*
 * Delete the on-disk balance item (BTRFS_BALANCE_OBJECTID /
 * BTRFS_TEMPORARY_ITEM_KEY, offset 0) from the tree root, so that an
 * interrupted balance is no longer resumed on the next mount.
 *
 * Returns 0 on success, -ENOENT if no balance item is stored, or another
 * negative errno from the search/delete/transaction paths.  A commit
 * failure is reported only when the deletion itself succeeded.
 */
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	/* ins_len = -1 (delete lookup), cow = 1 since the leaf is modified */
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* Exact key not found: there is no balance item to delete. */
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	/*
	 * The transaction is committed even on failure so the handle is
	 * always released; a commit error only overrides ret == 0.
	 */
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
3231
3232 /*
3233  * This is a heuristic used to reduce the number of chunks balanced on
3234  * resume after balance was interrupted.
3235  */
3236 static void update_balance_args(struct btrfs_balance_control *bctl)
3237 {
3238         /*
3239          * Turn on soft mode for chunk types that were being converted.
3240          */
3241         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3242                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3243         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3244                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3245         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3246                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3247
3248         /*
3249          * Turn on usage filter if is not already used.  The idea is
3250          * that chunks that we have already balanced should be
3251          * reasonably full.  Don't do it for chunks that are being
3252          * converted - that will keep us from relocating unconverted
3253          * (albeit full) chunks.
3254          */
3255         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3256             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3257             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3258                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3259                 bctl->data.usage = 90;
3260         }
3261         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3262             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3263             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3264                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3265                 bctl->sys.usage = 90;
3266         }
3267         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3268             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3269             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3270                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3271                 bctl->meta.usage = 90;
3272         }
3273 }
3274
3275 /*
3276  * Should be called with both balance and volume mutexes held to
3277  * serialize other volume operations (add_dev/rm_dev/resize) with
3278  * restriper.  Same goes for unset_balance_control.
3279  */
/*
 * Install @bctl as the filesystem's active balance control.
 *
 * Must not be called while a balance control is already installed
 * (BUG_ON enforces this).  The spinlock makes the pointer update safe
 * against readers of fs_info->balance_ctl.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}
3290
/*
 * Tear down the active balance control: detach it from fs_info under the
 * balance lock, then free it.
 *
 * Must only be called while a balance control is installed (BUG_ON
 * enforces this).  kfree happens after the unlock so no reader can see
 * a freed pointer through fs_info->balance_ctl.
 */
static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}
3303
3304 /*
3305  * Balance filters.  Return 1 if chunk should be filtered out
3306  * (should not be balanced).
3307  */
3308 static int chunk_profiles_filter(u64 chunk_type,
3309                                  struct btrfs_balance_args *bargs)
3310 {
3311         chunk_type = chunk_to_extended(chunk_type) &
3312                                 BTRFS_EXTENDED_PROFILE_MASK;
3313
3314         if (bargs->profiles & chunk_type)
3315                 return 0;
3316
3317         return 1;
3318 }
3319
/*
 * Usage-range filter: return 1 (filter out) unless the chunk's used byte
 * count lies in [usage_min%, usage_max%) of the chunk size.
 *
 * Edge cases: usage_min == 0 disables the lower bound; usage_max == 0
 * sets the upper threshold to a single byte, so only completely empty
 * chunks pass; usage_max > 100 disables the upper bound.
 */
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	/*
	 * NOTE(review): the lookup result is dereferenced without a NULL
	 * check -- presumably a block group always exists for a chunk that
	 * reaches the balance filters; confirm against the callers.
	 */
	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	/* Half-open interval: min inclusive, max exclusive. */
	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
3352
/*
 * Single-value usage filter: return 1 (filter out) unless the chunk is
 * strictly less than bargs->usage percent full.
 *
 * NOTE(review): the zero check reads bargs->usage_min while the
 * threshold uses bargs->usage.  The balance args apparently overlay
 * these fields on the same bytes (see the "same bytes" comments further
 * down this file) -- confirm before treating this as a typo.
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	/*
	 * NOTE(review): lookup result not NULL-checked, same assumption as
	 * chunk_usage_range_filter() -- confirm a block group must exist.
	 */
	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh = 1;	/* only fully empty chunks pass */
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
3377
3378 static int chunk_devid_filter(struct extent_buffer *leaf,
3379                               struct btrfs_chunk *chunk,
3380                               struct btrfs_balance_args *bargs)
3381 {
3382         struct btrfs_stripe *stripe;
3383         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3384         int i;
3385
3386         for (i = 0; i < num_stripes; i++) {
3387                 stripe = btrfs_stripe_nr(chunk, i);
3388                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3389                         return 0;
3390         }
3391
3392         return 1;
3393 }
3394
/* [pstart, pend) */
/*
 * Device-range filter: return 1 (filter out) unless some stripe of the
 * chunk on the devid-selected device overlaps the physical byte range
 * [pstart, pend).  Without the devid filter this is a no-op (returns 0),
 * since a physical range is meaningless across all devices at once.
 */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	/*
	 * factor divides the chunk's logical length down to an estimate of
	 * the per-device stripe length: mirrored profiles duplicate data
	 * across half the stripes, RAID5/6 spend 1/2 stripes on parity,
	 * everything else maps the length across all stripes directly.
	 */
	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		/* only stripes on the requested device are considered */
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		/* half-open interval overlap test against [pstart, pend) */
		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}
3437
3438 /* [vstart, vend) */
3439 static int chunk_vrange_filter(struct extent_buffer *leaf,
3440                                struct btrfs_chunk *chunk,
3441                                u64 chunk_offset,
3442                                struct btrfs_balance_args *bargs)
3443 {
3444         if (chunk_offset < bargs->vend &&
3445             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3446                 /* at least part of the chunk is inside this vrange */
3447                 return 0;
3448
3449         return 1;
3450 }
3451
3452 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3453                                struct btrfs_chunk *chunk,
3454                                struct btrfs_balance_args *bargs)
3455 {
3456         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3457
3458         if (bargs->stripes_min <= num_stripes
3459                         && num_stripes <= bargs->stripes_max)
3460                 return 0;
3461
3462         return 1;
3463 }
3464
3465 static int chunk_soft_convert_filter(u64 chunk_type,
3466                                      struct btrfs_balance_args *bargs)
3467 {
3468         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3469                 return 0;
3470
3471         chunk_type = chunk_to_extended(chunk_type) &
3472                                 BTRFS_EXTENDED_PROFILE_MASK;
3473
3474         if (bargs->target == chunk_type)
3475                 return 1;
3476
3477         return 0;
3478 }
3479
/*
 * Decide whether a chunk should be relocated by the running balance.
 * Runs the chain of balance filters for the chunk's type (data/sys/meta)
 * and returns 1 to balance the chunk, 0 to skip it.
 *
 * NOT side-effect free: the limit filters decrement bargs->limit /
 * bargs->limit_max as chunks are accepted, so each accepted chunk
 * consumes part of the budget.  That is why "limited by count" must be
 * the last filter in the chain.
 */
static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/*
	 * The type filter above guarantees exactly one of these matches,
	 * so bargs cannot remain NULL past this point.
	 */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter — single-value and range variants are exclusive */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;	/* consume one slot of the budget */
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3568
3569 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3570 {
3571         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3572         struct btrfs_root *chunk_root = fs_info->chunk_root;
3573         struct btrfs_root *dev_root = fs_info->dev_root;
3574         struct list_head *devices;
3575         struct btrfs_device *device;
3576         u64 old_size;
3577         u64 size_to_free;
3578         u64 chunk_type;
3579         struct btrfs_chunk *chunk;
3580         struct btrfs_path *path = NULL;
3581         struct btrfs_key key;
3582         struct btrfs_key found_key;
3583         struct btrfs_trans_handle *trans;
3584         struct extent_buffer *leaf;
3585         int slot;
3586         int ret;
3587         int enospc_errors = 0;
3588         bool counting = true;
3589         /* The single value limit and min/max limits use the same bytes in the */
3590         u64 limit_data = bctl->data.limit;
3591         u64 limit_meta = bctl->meta.limit;
3592         u64 limit_sys = bctl->sys.limit;
3593         u32 count_data = 0;
3594         u32 count_meta = 0;
3595         u32 count_sys = 0;
3596         int chunk_reserved = 0;
3597
3598         /* step one make some room on all the devices */
3599         devices = &fs_info->fs_devices->devices;
3600         list_for_each_entry(device, devices, dev_list) {
3601                 old_size = btrfs_device_get_total_bytes(device);
3602                 size_to_free = div_factor(old_size, 1);
3603                 size_to_free = min_t(u64, size_to_free, SZ_1M);
3604                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3605                     btrfs_device_get_total_bytes(device) -
3606                     btrfs_device_get_bytes_used(device) > size_to_free ||
3607                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3608                         continue;
3609
3610                 ret = btrfs_shrink_device(device, old_size - size_to_free);
3611                 if (ret == -ENOSPC)
3612                         break;
3613                 if (ret) {
3614                         /* btrfs_shrink_device never returns ret > 0 */
3615                         WARN_ON(ret > 0);
3616                         goto error;
3617                 }
3618
3619                 trans = btrfs_start_transaction(dev_root, 0);
3620                 if (IS_ERR(trans)) {
3621                         ret = PTR_ERR(trans);
3622                         btrfs_info_in_rcu(fs_info,
3623                  "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3624                                           rcu_str_deref(device->name), ret,
3625                                           old_size, old_size - size_to_free);
3626                         goto error;
3627                 }
3628
3629                 ret = btrfs_grow_device(trans, device, old_size);
3630                 if (ret) {
3631                         btrfs_end_transaction(trans);
3632                         /* btrfs_grow_device never returns ret > 0 */
3633                         WARN_ON(ret > 0);
3634                         btrfs_info_in_rcu(fs_info,
3635                  "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3636                                           rcu_str_deref(device->name), ret,
3637                                           old_size, old_size - size_to_free);
3638                         goto error;
3639                 }
3640
3641                 btrfs_end_transaction(trans);
3642         }
3643
3644         /* step two, relocate all the chunks */
3645         path = btrfs_alloc_path();
3646         if (!path) {
3647                 ret = -ENOMEM;
3648                 goto error;
3649         }
3650
3651         /* zero out stat counters */
3652         spin_lock(&fs_info->balance_lock);
3653         memset(&bctl->stat, 0, sizeof(bctl->stat));
3654         spin_unlock(&fs_info->balance_lock);
3655 again:
3656         if (!counting) {
3657                 /*
3658                  * The single value limit and min/max limits use the same bytes
3659                  * in the
3660                  */
3661                 bctl->data.limit = limit_data;
3662                 bctl->meta.limit = limit_meta;
3663                 bctl->sys.limit = limit_sys;
3664         }
3665         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3666         key.offset = (u64)-1;
3667         key.type = BTRFS_CHUNK_ITEM_KEY;
3668
3669         while (1) {
3670                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3671                     atomic_read(&fs_info->balance_cancel_req)) {
3672                         ret = -ECANCELED;
3673                         goto error;
3674                 }
3675
3676                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3677                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3678                 if (ret < 0) {
3679                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3680                         goto error;
3681                 }
3682
3683                 /*
3684                  * this shouldn't happen, it means the last relocate
3685                  * failed
3686                  */
3687                 if (ret == 0)
3688                         BUG(); /* FIXME break ? */
3689
3690                 ret = btrfs_previous_item(chunk_root, path, 0,
3691                                           BTRFS_CHUNK_ITEM_KEY);
3692                 if (ret) {
3693                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3694                         ret = 0;
3695                         break;
3696                 }
3697
3698                 leaf = path->nodes[0];
3699                 slot = path->slots[0];
3700                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3701
3702                 if (found_key.objectid != key.objectid) {
3703                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3704                         break;
3705                 }
3706
3707                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3708                 chunk_type = btrfs_chunk_type(leaf, chunk);
3709
3710                 if (!counting) {
3711                         spin_lock(&fs_info->balance_lock);
3712                         bctl->stat.considered++;
3713                         spin_unlock(&fs_info->balance_lock);
3714                 }
3715
3716                 ret = should_balance_chunk(fs_info, leaf, chunk,
3717                                            found_key.offset);
3718
3719                 btrfs_release_path(path);
3720                 if (!ret) {
3721                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3722                         goto loop;
3723                 }
3724
3725                 if (counting) {
3726                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3727                         spin_lock(&fs_info->balance_lock);
3728                         bctl->stat.expected++;
3729                         spin_unlock(&fs_info->balance_lock);
3730
3731                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3732                                 count_data++;
3733                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3734                                 count_sys++;
3735                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3736                                 count_meta++;
3737
3738                         goto loop;
3739                 }
3740
3741                 /*
3742                  * Apply limit_min filter, no need to check if the LIMITS
3743                  * filter is used, limit_min is 0 by default
3744                  */
3745                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3746                                         count_data < bctl->data.limit_min)
3747                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3748                                         count_meta < bctl->meta.limit_min)
3749                      &nbs