btrfs: keep device list sorted
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/iocontext.h>
24 #include <linux/capability.h>
25 #include <linux/ratelimit.h>
26 #include <linux/kthread.h>
27 #include <linux/raid/pq.h>
28 #include <linux/semaphore.h>
29 #include <linux/uuid.h>
30 #include <linux/list_sort.h>
31 #include <asm/div64.h>
32 #include "ctree.h"
33 #include "extent_map.h"
34 #include "disk-io.h"
35 #include "transaction.h"
36 #include "print-tree.h"
37 #include "volumes.h"
38 #include "raid56.h"
39 #include "async-thread.h"
40 #include "check-integrity.h"
41 #include "rcu-string.h"
42 #include "math.h"
43 #include "dev-replace.h"
44 #include "sysfs.h"
45
46 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
47         [BTRFS_RAID_RAID10] = {
48                 .sub_stripes    = 2,
49                 .dev_stripes    = 1,
50                 .devs_max       = 0,    /* 0 == as many as possible */
51                 .devs_min       = 4,
52                 .tolerated_failures = 1,
53                 .devs_increment = 2,
54                 .ncopies        = 2,
55         },
56         [BTRFS_RAID_RAID1] = {
57                 .sub_stripes    = 1,
58                 .dev_stripes    = 1,
59                 .devs_max       = 2,
60                 .devs_min       = 2,
61                 .tolerated_failures = 1,
62                 .devs_increment = 2,
63                 .ncopies        = 2,
64         },
65         [BTRFS_RAID_DUP] = {
66                 .sub_stripes    = 1,
67                 .dev_stripes    = 2,
68                 .devs_max       = 1,
69                 .devs_min       = 1,
70                 .tolerated_failures = 0,
71                 .devs_increment = 1,
72                 .ncopies        = 2,
73         },
74         [BTRFS_RAID_RAID0] = {
75                 .sub_stripes    = 1,
76                 .dev_stripes    = 1,
77                 .devs_max       = 0,
78                 .devs_min       = 2,
79                 .tolerated_failures = 0,
80                 .devs_increment = 1,
81                 .ncopies        = 1,
82         },
83         [BTRFS_RAID_SINGLE] = {
84                 .sub_stripes    = 1,
85                 .dev_stripes    = 1,
86                 .devs_max       = 1,
87                 .devs_min       = 1,
88                 .tolerated_failures = 0,
89                 .devs_increment = 1,
90                 .ncopies        = 1,
91         },
92         [BTRFS_RAID_RAID5] = {
93                 .sub_stripes    = 1,
94                 .dev_stripes    = 1,
95                 .devs_max       = 0,
96                 .devs_min       = 2,
97                 .tolerated_failures = 1,
98                 .devs_increment = 1,
99                 .ncopies        = 2,
100         },
101         [BTRFS_RAID_RAID6] = {
102                 .sub_stripes    = 1,
103                 .dev_stripes    = 1,
104                 .devs_max       = 0,
105                 .devs_min       = 3,
106                 .tolerated_failures = 2,
107                 .devs_increment = 1,
108                 .ncopies        = 3,
109         },
110 };
111
112 const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
113         [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
114         [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
115         [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
116         [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
117         [BTRFS_RAID_SINGLE] = 0,
118         [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
119         [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
120 };
121
122 /*
123  * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
124  * condition is not met. Zero means there's no corresponding
125  * BTRFS_ERROR_DEV_*_NOT_MET value.
126  */
127 const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
128         [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
129         [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
130         [BTRFS_RAID_DUP]    = 0,
131         [BTRFS_RAID_RAID0]  = 0,
132         [BTRFS_RAID_SINGLE] = 0,
133         [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
134         [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
135 };
136
137 static int init_first_rw_device(struct btrfs_trans_handle *trans,
138                                 struct btrfs_fs_info *fs_info);
139 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
140 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
141 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
142 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
143 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
144                              enum btrfs_map_op op,
145                              u64 logical, u64 *length,
146                              struct btrfs_bio **bbio_ret,
147                              int mirror_num, int need_raid_map);
148
149 /*
150  * Device locking
151  * ==============
152  *
153  * There are several mutexes that protect manipulation of devices and low-level
154  * structures like chunks but not block groups, extents or files
155  *
156  * uuid_mutex (global lock)
157  * ------------------------
158  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
159  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
160  * device) or requested by the device= mount option
161  *
162  * the mutex can be very coarse and can cover long-running operations
163  *
164  * protects: updates to fs_devices counters like missing devices, rw devices,
165  * seeding, structure cloning, opening/closing devices at mount/umount time
166  *
167  * global::fs_devs - add, remove, updates to the global list
168  *
169  * does not protect: manipulation of the fs_devices::devices list!
170  *
171  * btrfs_device::name - renames (write side), read is RCU
172  *
173  * fs_devices::device_list_mutex (per-fs, with RCU)
174  * ------------------------------------------------
175  * protects updates to fs_devices::devices, ie. adding and deleting
176  *
177  * simple list traversal with read-only actions can be done with RCU protection
178  *
179  * may be used to exclude some operations from running concurrently without any
180  * modifications to the list (see write_all_supers)
181  *
182  * volume_mutex
183  * ------------
184  * coarse lock owned by a mounted filesystem; used to exclude some operations
185  * that cannot run in parallel and affect the higher-level properties of the
186  * filesystem like: device add/deleting/resize/replace, or balance
187  *
188  * balance_mutex
189  * -------------
190  * protects balance structures (status, state) and context accessed from
191  * several places (internally, ioctl)
192  *
193  * chunk_mutex
194  * -----------
195  * protects chunks, adding or removing during allocation, trim or when a new
196  * device is added/removed
197  *
198  * cleaner_mutex
199  * -------------
200  * a big lock that is held by the cleaner thread and prevents running subvolume
201  * cleaning together with relocation or delayed iputs
202  *
203  *
204  * Lock nesting
205  * ============
206  *
207  * uuid_mutex
208  *   volume_mutex
209  *     device_list_mutex
210  *       chunk_mutex
211  *     balance_mutex
212  */
213
214 DEFINE_MUTEX(uuid_mutex);
215 static LIST_HEAD(fs_uuids);
216 struct list_head *btrfs_get_fs_uuids(void)
217 {
218         return &fs_uuids;
219 }
220
221 /*
222  * alloc_fs_devices - allocate struct btrfs_fs_devices
223  * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
224  *
225  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
226  * The returned struct is not linked onto any lists and can be destroyed with
227  * kfree() right away.
228  */
229 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
230 {
231         struct btrfs_fs_devices *fs_devs;
232
233         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
234         if (!fs_devs)
235                 return ERR_PTR(-ENOMEM);
236
237         mutex_init(&fs_devs->device_list_mutex);
238
239         INIT_LIST_HEAD(&fs_devs->devices);
240         INIT_LIST_HEAD(&fs_devs->resized_devices);
241         INIT_LIST_HEAD(&fs_devs->alloc_list);
242         INIT_LIST_HEAD(&fs_devs->list);
243         if (fsid)
244                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
245
246         return fs_devs;
247 }
248
249 static void free_device(struct btrfs_device *device)
250 {
251         rcu_string_free(device->name);
252         bio_put(device->flush_bio);
253         kfree(device);
254 }
255
256 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
257 {
258         struct btrfs_device *device;
259         WARN_ON(fs_devices->opened);
260         while (!list_empty(&fs_devices->devices)) {
261                 device = list_entry(fs_devices->devices.next,
262                                     struct btrfs_device, dev_list);
263                 list_del(&device->dev_list);
264                 free_device(device);
265         }
266         kfree(fs_devices);
267 }
268
269 static void btrfs_kobject_uevent(struct block_device *bdev,
270                                  enum kobject_action action)
271 {
272         int ret;
273
274         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
275         if (ret)
276                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
277                         action,
278                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
279                         &disk_to_dev(bdev->bd_disk)->kobj);
280 }
281
282 void btrfs_cleanup_fs_uuids(void)
283 {
284         struct btrfs_fs_devices *fs_devices;
285
286         while (!list_empty(&fs_uuids)) {
287                 fs_devices = list_entry(fs_uuids.next,
288                                         struct btrfs_fs_devices, list);
289                 list_del(&fs_devices->list);
290                 free_fs_devices(fs_devices);
291         }
292 }
293
294 /*
295  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
296  * Returned struct is not linked onto any lists and must be destroyed using
297  * free_device.
298  */
299 static struct btrfs_device *__alloc_device(void)
300 {
301         struct btrfs_device *dev;
302
303         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
304         if (!dev)
305                 return ERR_PTR(-ENOMEM);
306
307         /*
308          * Preallocate a bio that's always going to be used for flushing device
309          * barriers and matches the device lifespan
310          */
311         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
312         if (!dev->flush_bio) {
313                 kfree(dev);
314                 return ERR_PTR(-ENOMEM);
315         }
316
317         INIT_LIST_HEAD(&dev->dev_list);
318         INIT_LIST_HEAD(&dev->dev_alloc_list);
319         INIT_LIST_HEAD(&dev->resized_list);
320
321         spin_lock_init(&dev->io_lock);
322
323         atomic_set(&dev->reada_in_flight, 0);
324         atomic_set(&dev->dev_stats_ccnt, 0);
325         btrfs_device_data_ordered_init(dev);
326         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
327         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
328
329         return dev;
330 }
331
332 /*
333  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
334  * return NULL.
335  *
336  * If devid and uuid are both specified, the match must be exact, otherwise
337  * only devid is used.
338  */
339 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
340                 u64 devid, const u8 *uuid)
341 {
342         struct list_head *head = &fs_devices->devices;
343         struct btrfs_device *dev;
344
345         list_for_each_entry(dev, head, dev_list) {
346                 if (dev->devid == devid &&
347                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
348                         return dev;
349                 }
350         }
351         return NULL;
352 }
353
354 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
355 {
356         struct btrfs_fs_devices *fs_devices;
357
358         list_for_each_entry(fs_devices, &fs_uuids, list) {
359                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
360                         return fs_devices;
361         }
362         return NULL;
363 }
364
365 static int
366 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
367                       int flush, struct block_device **bdev,
368                       struct buffer_head **bh)
369 {
370         int ret;
371
372         *bdev = blkdev_get_by_path(device_path, flags, holder);
373
374         if (IS_ERR(*bdev)) {
375                 ret = PTR_ERR(*bdev);
376                 goto error;
377         }
378
379         if (flush)
380                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
381         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
382         if (ret) {
383                 blkdev_put(*bdev, flags);
384                 goto error;
385         }
386         invalidate_bdev(*bdev);
387         *bh = btrfs_read_dev_super(*bdev);
388         if (IS_ERR(*bh)) {
389                 ret = PTR_ERR(*bh);
390                 blkdev_put(*bdev, flags);
391                 goto error;
392         }
393
394         return 0;
395
396 error:
397         *bdev = NULL;
398         *bh = NULL;
399         return ret;
400 }
401
402 static void requeue_list(struct btrfs_pending_bios *pending_bios,
403                         struct bio *head, struct bio *tail)
404 {
405
406         struct bio *old_head;
407
408         old_head = pending_bios->head;
409         pending_bios->head = head;
410         if (pending_bios->tail)
411                 tail->bi_next = old_head;
412         else
413                 pending_bios->tail = tail;
414 }
415
416 /*
417  * we try to collect pending bios for a device so we don't get a large
418  * number of procs sending bios down to the same device.  This greatly
419  * improves the schedulers ability to collect and merge the bios.
420  *
421  * But, it also turns into a long list of bios to process and that is sure
422  * to eventually make the worker thread block.  The solution here is to
423  * make some progress and then put this work struct back at the end of
424  * the list if the block device is congested.  This way, multiple devices
425  * can make progress from a single worker thread.
426  */
427 static noinline void run_scheduled_bios(struct btrfs_device *device)
428 {
429         struct btrfs_fs_info *fs_info = device->fs_info;
430         struct bio *pending;
431         struct backing_dev_info *bdi;
432         struct btrfs_pending_bios *pending_bios;
433         struct bio *tail;
434         struct bio *cur;
435         int again = 0;
436         unsigned long num_run;
437         unsigned long batch_run = 0;
438         unsigned long last_waited = 0;
439         int force_reg = 0;
440         int sync_pending = 0;
441         struct blk_plug plug;
442
443         /*
444          * this function runs all the bios we've collected for
445          * a particular device.  We don't want to wander off to
446          * another device without first sending all of these down.
447          * So, setup a plug here and finish it off before we return
448          */
449         blk_start_plug(&plug);
450
451         bdi = device->bdev->bd_bdi;
452
453 loop:
454         spin_lock(&device->io_lock);
455
456 loop_lock:
457         num_run = 0;
458
459         /* take all the bios off the list at once and process them
460          * later on (without the lock held).  But, remember the
461          * tail and other pointers so the bios can be properly reinserted
462          * into the list if we hit congestion
463          */
464         if (!force_reg && device->pending_sync_bios.head) {
465                 pending_bios = &device->pending_sync_bios;
466                 force_reg = 1;
467         } else {
468                 pending_bios = &device->pending_bios;
469                 force_reg = 0;
470         }
471
472         pending = pending_bios->head;
473         tail = pending_bios->tail;
474         WARN_ON(pending && !tail);
475
476         /*
477          * if pending was null this time around, no bios need processing
478          * at all and we can stop.  Otherwise it'll loop back up again
479          * and do an additional check so no bios are missed.
480          *
481          * device->running_pending is used to synchronize with the
482          * schedule_bio code.
483          */
484         if (device->pending_sync_bios.head == NULL &&
485             device->pending_bios.head == NULL) {
486                 again = 0;
487                 device->running_pending = 0;
488         } else {
489                 again = 1;
490                 device->running_pending = 1;
491         }
492
493         pending_bios->head = NULL;
494         pending_bios->tail = NULL;
495
496         spin_unlock(&device->io_lock);
497
498         while (pending) {
499
500                 rmb();
501                 /* we want to work on both lists, but do more bios on the
502                  * sync list than the regular list
503                  */
504                 if ((num_run > 32 &&
505                     pending_bios != &device->pending_sync_bios &&
506                     device->pending_sync_bios.head) ||
507                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
508                     device->pending_bios.head)) {
509                         spin_lock(&device->io_lock);
510                         requeue_list(pending_bios, pending, tail);
511                         goto loop_lock;
512                 }
513
514                 cur = pending;
515                 pending = pending->bi_next;
516                 cur->bi_next = NULL;
517
518                 BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
519
520                 /*
521                  * if we're doing the sync list, record that our
522                  * plug has some sync requests on it
523                  *
524                  * If we're doing the regular list and there are
525                  * sync requests sitting around, unplug before
526                  * we add more
527                  */
528                 if (pending_bios == &device->pending_sync_bios) {
529                         sync_pending = 1;
530                 } else if (sync_pending) {
531                         blk_finish_plug(&plug);
532                         blk_start_plug(&plug);
533                         sync_pending = 0;
534                 }
535
536                 btrfsic_submit_bio(cur);
537                 num_run++;
538                 batch_run++;
539
540                 cond_resched();
541
542                 /*
543                  * we made progress, there is more work to do and the bdi
544                  * is now congested.  Back off and let other work structs
545                  * run instead
546                  */
547                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
548                     fs_info->fs_devices->open_devices > 1) {
549                         struct io_context *ioc;
550
551                         ioc = current->io_context;
552
553                         /*
554                          * the main goal here is that we don't want to
555                          * block if we're going to be able to submit
556                          * more requests without blocking.
557                          *
558                          * This code does two great things, it pokes into
559                          * the elevator code from a filesystem _and_
560                          * it makes assumptions about how batching works.
561                          */
562                         if (ioc && ioc->nr_batch_requests > 0 &&
563                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
564                             (last_waited == 0 ||
565                              ioc->last_waited == last_waited)) {
566                                 /*
567                                  * we want to go through our batch of
568                                  * requests and stop.  So, we copy out
569                                  * the ioc->last_waited time and test
570                                  * against it before looping
571                                  */
572                                 last_waited = ioc->last_waited;
573                                 cond_resched();
574                                 continue;
575                         }
576                         spin_lock(&device->io_lock);
577                         requeue_list(pending_bios, pending, tail);
578                         device->running_pending = 1;
579
580                         spin_unlock(&device->io_lock);
581                         btrfs_queue_work(fs_info->submit_workers,
582                                          &device->work);
583                         goto done;
584                 }
585         }
586
587         cond_resched();
588         if (again)
589                 goto loop;
590
591         spin_lock(&device->io_lock);
592         if (device->pending_bios.head || device->pending_sync_bios.head)
593                 goto loop_lock;
594         spin_unlock(&device->io_lock);
595
596 done:
597         blk_finish_plug(&plug);
598 }
599
600 static void pending_bios_fn(struct btrfs_work *work)
601 {
602         struct btrfs_device *device;
603
604         device = container_of(work, struct btrfs_device, work);
605         run_scheduled_bios(device);
606 }
607
608 /*
609  *  Search and remove all stale (devices which are not mounted) devices.
610  *  When both inputs are NULL, it will search and release all stale devices.
611  *  path:       Optional. When provided will it release all unmounted devices
612  *              matching this path only.
613  *  skip_dev:   Optional. Will skip this device when searching for the stale
614  *              devices.
615  */
616 static void btrfs_free_stale_devices(const char *path,
617                                      struct btrfs_device *skip_dev)
618 {
619         struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
620         struct btrfs_device *dev, *tmp_dev;
621
622         list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
623
624                 if (fs_devs->opened)
625                         continue;
626
627                 list_for_each_entry_safe(dev, tmp_dev,
628                                          &fs_devs->devices, dev_list) {
629                         int not_found = 0;
630
631                         if (skip_dev && skip_dev == dev)
632                                 continue;
633                         if (path && !dev->name)
634                                 continue;
635
636                         rcu_read_lock();
637                         if (path)
638                                 not_found = strcmp(rcu_str_deref(dev->name),
639                                                    path);
640                         rcu_read_unlock();
641                         if (not_found)
642                                 continue;
643
644                         /* delete the stale device */
645                         if (fs_devs->num_devices == 1) {
646                                 btrfs_sysfs_remove_fsid(fs_devs);
647                                 list_del(&fs_devs->list);
648                                 free_fs_devices(fs_devs);
649                                 break;
650                         } else {
651                                 fs_devs->num_devices--;
652                                 list_del(&dev->dev_list);
653                                 free_device(dev);
654                         }
655                 }
656         }
657 }
658
659 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
660                         struct btrfs_device *device, fmode_t flags,
661                         void *holder)
662 {
663         struct request_queue *q;
664         struct block_device *bdev;
665         struct buffer_head *bh;
666         struct btrfs_super_block *disk_super;
667         u64 devid;
668         int ret;
669
670         if (device->bdev)
671                 return -EINVAL;
672         if (!device->name)
673                 return -EINVAL;
674
675         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
676                                     &bdev, &bh);
677         if (ret)
678                 return ret;
679
680         disk_super = (struct btrfs_super_block *)bh->b_data;
681         devid = btrfs_stack_device_id(&disk_super->dev_item);
682         if (devid != device->devid)
683                 goto error_brelse;
684
685         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
686                 goto error_brelse;
687
688         device->generation = btrfs_super_generation(disk_super);
689
690         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
691                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
692                 fs_devices->seeding = 1;
693         } else {
694                 if (bdev_read_only(bdev))
695                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
696                 else
697                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
698         }
699
700         q = bdev_get_queue(bdev);
701         if (!blk_queue_nonrot(q))
702                 fs_devices->rotating = 1;
703
704         device->bdev = bdev;
705         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
706         device->mode = flags;
707
708         fs_devices->open_devices++;
709         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
710             device->devid != BTRFS_DEV_REPLACE_DEVID) {
711                 fs_devices->rw_devices++;
712                 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
713         }
714         brelse(bh);
715
716         return 0;
717
718 error_brelse:
719         brelse(bh);
720         blkdev_put(bdev, flags);
721
722         return -EINVAL;
723 }
724
725 /*
726  * Add new device to list of registered devices
727  *
728  * Returns:
729  * device pointer which was just added or updated when successful
730  * error pointer when failed
731  */
732 static noinline struct btrfs_device *device_list_add(const char *path,
733                            struct btrfs_super_block *disk_super)
734 {
735         struct btrfs_device *device;
736         struct btrfs_fs_devices *fs_devices;
737         struct rcu_string *name;
738         u64 found_transid = btrfs_super_generation(disk_super);
739         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
740
741         fs_devices = find_fsid(disk_super->fsid);
742         if (!fs_devices) {
743                 fs_devices = alloc_fs_devices(disk_super->fsid);
744                 if (IS_ERR(fs_devices))
745                         return ERR_CAST(fs_devices);
746
747                 list_add(&fs_devices->list, &fs_uuids);
748
749                 device = NULL;
750         } else {
751                 device = find_device(fs_devices, devid,
752                                 disk_super->dev_item.uuid);
753         }
754
755         if (!device) {
756                 if (fs_devices->opened)
757                         return ERR_PTR(-EBUSY);
758
759                 device = btrfs_alloc_device(NULL, &devid,
760                                             disk_super->dev_item.uuid);
761                 if (IS_ERR(device)) {
762                         /* we can safely leave the fs_devices entry around */
763                         return device;
764                 }
765
766                 name = rcu_string_strdup(path, GFP_NOFS);
767                 if (!name) {
768                         free_device(device);
769                         return ERR_PTR(-ENOMEM);
770                 }
771                 rcu_assign_pointer(device->name, name);
772
773                 mutex_lock(&fs_devices->device_list_mutex);
774                 list_add_rcu(&device->dev_list, &fs_devices->devices);
775                 fs_devices->num_devices++;
776                 mutex_unlock(&fs_devices->device_list_mutex);
777
778                 device->fs_devices = fs_devices;
779                 btrfs_free_stale_devices(path, device);
780
781                 if (disk_super->label[0])
782                         pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
783                                 disk_super->label, devid, found_transid, path);
784                 else
785                         pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
786                                 disk_super->fsid, devid, found_transid, path);
787
788         } else if (!device->name || strcmp(device->name->str, path)) {
789                 /*
790                  * When FS is already mounted.
791                  * 1. If you are here and if the device->name is NULL that
792                  *    means this device was missing at time of FS mount.
793                  * 2. If you are here and if the device->name is different
794                  *    from 'path' that means either
795                  *      a. The same device disappeared and reappeared with
796                  *         different name. or
797                  *      b. The missing-disk-which-was-replaced, has
798                  *         reappeared now.
799                  *
800                  * We must allow 1 and 2a above. But 2b would be a spurious
801                  * and unintentional.
802                  *
803                  * Further in case of 1 and 2a above, the disk at 'path'
804                  * would have missed some transaction when it was away and
805                  * in case of 2a the stale bdev has to be updated as well.
806                  * 2b must not be allowed at all time.
807                  */
808
809                 /*
810                  * For now, we do allow update to btrfs_fs_device through the
811                  * btrfs dev scan cli after FS has been mounted.  We're still
812                  * tracking a problem where systems fail mount by subvolume id
813                  * when we reject replacement on a mounted FS.
814                  */
815                 if (!fs_devices->opened && found_transid < device->generation) {
816                         /*
817                          * That is if the FS is _not_ mounted and if you
818                          * are here, that means there is more than one
819                          * disk with same uuid and devid.We keep the one
820                          * with larger generation number or the last-in if
821                          * generation are equal.
822                          */
823                         return ERR_PTR(-EEXIST);
824                 }
825
826                 name = rcu_string_strdup(path, GFP_NOFS);
827                 if (!name)
828                         return ERR_PTR(-ENOMEM);
829                 rcu_string_free(device->name);
830                 rcu_assign_pointer(device->name, name);
831                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
832                         fs_devices->missing_devices--;
833                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
834                 }
835         }
836
837         /*
838          * Unmount does not free the btrfs_device struct but would zero
839          * generation along with most of the other members. So just update
840          * it back. We need it to pick the disk with largest generation
841          * (as above).
842          */
843         if (!fs_devices->opened)
844                 device->generation = found_transid;
845
846         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
847
848         return device;
849 }
850
851 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
852 {
853         struct btrfs_fs_devices *fs_devices;
854         struct btrfs_device *device;
855         struct btrfs_device *orig_dev;
856
857         fs_devices = alloc_fs_devices(orig->fsid);
858         if (IS_ERR(fs_devices))
859                 return fs_devices;
860
861         mutex_lock(&orig->device_list_mutex);
862         fs_devices->total_devices = orig->total_devices;
863
864         /* We have held the volume lock, it is safe to get the devices. */
865         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
866                 struct rcu_string *name;
867
868                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
869                                             orig_dev->uuid);
870                 if (IS_ERR(device))
871                         goto error;
872
873                 /*
874                  * This is ok to do without rcu read locked because we hold the
875                  * uuid mutex so nothing we touch in here is going to disappear.
876                  */
877                 if (orig_dev->name) {
878                         name = rcu_string_strdup(orig_dev->name->str,
879                                         GFP_KERNEL);
880                         if (!name) {
881                                 free_device(device);
882                                 goto error;
883                         }
884                         rcu_assign_pointer(device->name, name);
885                 }
886
887                 list_add(&device->dev_list, &fs_devices->devices);
888                 device->fs_devices = fs_devices;
889                 fs_devices->num_devices++;
890         }
891         mutex_unlock(&orig->device_list_mutex);
892         return fs_devices;
893 error:
894         mutex_unlock(&orig->device_list_mutex);
895         free_fs_devices(fs_devices);
896         return ERR_PTR(-ENOMEM);
897 }
898
899 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
900 {
901         struct btrfs_device *device, *next;
902         struct btrfs_device *latest_dev = NULL;
903
904         mutex_lock(&uuid_mutex);
905 again:
906         /* This is the initialized path, it is safe to release the devices. */
907         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
908                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
909                                                         &device->dev_state)) {
910                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
911                              &device->dev_state) &&
912                              (!latest_dev ||
913                               device->generation > latest_dev->generation)) {
914                                 latest_dev = device;
915                         }
916                         continue;
917                 }
918
919                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
920                         /*
921                          * In the first step, keep the device which has
922                          * the correct fsid and the devid that is used
923                          * for the dev_replace procedure.
924                          * In the second step, the dev_replace state is
925                          * read from the device tree and it is known
926                          * whether the procedure is really active or
927                          * not, which means whether this device is
928                          * used or whether it should be removed.
929                          */
930                         if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
931                                                   &device->dev_state)) {
932                                 continue;
933                         }
934                 }
935                 if (device->bdev) {
936                         blkdev_put(device->bdev, device->mode);
937                         device->bdev = NULL;
938                         fs_devices->open_devices--;
939                 }
940                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
941                         list_del_init(&device->dev_alloc_list);
942                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
943                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
944                                       &device->dev_state))
945                                 fs_devices->rw_devices--;
946                 }
947                 list_del_init(&device->dev_list);
948                 fs_devices->num_devices--;
949                 free_device(device);
950         }
951
952         if (fs_devices->seed) {
953                 fs_devices = fs_devices->seed;
954                 goto again;
955         }
956
957         fs_devices->latest_bdev = latest_dev->bdev;
958
959         mutex_unlock(&uuid_mutex);
960 }
961
962 static void free_device_rcu(struct rcu_head *head)
963 {
964         struct btrfs_device *device;
965
966         device = container_of(head, struct btrfs_device, rcu);
967         free_device(device);
968 }
969
970 static void btrfs_close_bdev(struct btrfs_device *device)
971 {
972         if (!device->bdev)
973                 return;
974
975         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
976                 sync_blockdev(device->bdev);
977                 invalidate_bdev(device->bdev);
978         }
979
980         blkdev_put(device->bdev, device->mode);
981 }
982
983 static void btrfs_prepare_close_one_device(struct btrfs_device *device)
984 {
985         struct btrfs_fs_devices *fs_devices = device->fs_devices;
986         struct btrfs_device *new_device;
987         struct rcu_string *name;
988
989         if (device->bdev)
990                 fs_devices->open_devices--;
991
992         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
993             device->devid != BTRFS_DEV_REPLACE_DEVID) {
994                 list_del_init(&device->dev_alloc_list);
995                 fs_devices->rw_devices--;
996         }
997
998         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
999                 fs_devices->missing_devices--;
1000
1001         new_device = btrfs_alloc_device(NULL, &device->devid,
1002                                         device->uuid);
1003         BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
1004
1005         /* Safe because we are under uuid_mutex */
1006         if (device->name) {
1007                 name = rcu_string_strdup(device->name->str, GFP_NOFS);
1008                 BUG_ON(!name); /* -ENOMEM */
1009                 rcu_assign_pointer(new_device->name, name);
1010         }
1011
1012         list_replace_rcu(&device->dev_list, &new_device->dev_list);
1013         new_device->fs_devices = device->fs_devices;
1014 }
1015
1016 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1017 {
1018         struct btrfs_device *device, *tmp;
1019         struct list_head pending_put;
1020
1021         INIT_LIST_HEAD(&pending_put);
1022
1023         if (--fs_devices->opened > 0)
1024                 return 0;
1025
1026         mutex_lock(&fs_devices->device_list_mutex);
1027         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1028                 btrfs_prepare_close_one_device(device);
1029                 list_add(&device->dev_list, &pending_put);
1030         }
1031         mutex_unlock(&fs_devices->device_list_mutex);
1032
1033         /*
1034          * btrfs_show_devname() is using the device_list_mutex,
1035          * sometimes call to blkdev_put() leads vfs calling
1036          * into this func. So do put outside of device_list_mutex,
1037          * as of now.
1038          */
1039         while (!list_empty(&pending_put)) {
1040                 device = list_first_entry(&pending_put,
1041                                 struct btrfs_device, dev_list);
1042                 list_del(&device->dev_list);
1043                 btrfs_close_bdev(device);
1044                 call_rcu(&device->rcu, free_device_rcu);
1045         }
1046
1047         WARN_ON(fs_devices->open_devices);
1048         WARN_ON(fs_devices->rw_devices);
1049         fs_devices->opened = 0;
1050         fs_devices->seeding = 0;
1051
1052         return 0;
1053 }
1054
1055 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1056 {
1057         struct btrfs_fs_devices *seed_devices = NULL;
1058         int ret;
1059
1060         mutex_lock(&uuid_mutex);
1061         ret = __btrfs_close_devices(fs_devices);
1062         if (!fs_devices->opened) {
1063                 seed_devices = fs_devices->seed;
1064                 fs_devices->seed = NULL;
1065         }
1066         mutex_unlock(&uuid_mutex);
1067
1068         while (seed_devices) {
1069                 fs_devices = seed_devices;
1070                 seed_devices = fs_devices->seed;
1071                 __btrfs_close_devices(fs_devices);
1072                 free_fs_devices(fs_devices);
1073         }
1074         return ret;
1075 }
1076
1077 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1078                                 fmode_t flags, void *holder)
1079 {
1080         struct list_head *head = &fs_devices->devices;
1081         struct btrfs_device *device;
1082         struct btrfs_device *latest_dev = NULL;
1083         int ret = 0;
1084
1085         flags |= FMODE_EXCL;
1086
1087         list_for_each_entry(device, head, dev_list) {
1088                 /* Just open everything we can; ignore failures here */
1089                 if (btrfs_open_one_device(fs_devices, device, flags, holder))
1090                         continue;
1091
1092                 if (!latest_dev ||
1093                     device->generation > latest_dev->generation)
1094                         latest_dev = device;
1095         }
1096         if (fs_devices->open_devices == 0) {
1097                 ret = -EINVAL;
1098                 goto out;
1099         }
1100         fs_devices->opened = 1;
1101         fs_devices->latest_bdev = latest_dev->bdev;
1102         fs_devices->total_rw_bytes = 0;
1103 out:
1104         return ret;
1105 }
1106
1107 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1108 {
1109         struct btrfs_device *dev1, *dev2;
1110
1111         dev1 = list_entry(a, struct btrfs_device, dev_list);
1112         dev2 = list_entry(b, struct btrfs_device, dev_list);
1113
1114         if (dev1->devid < dev2->devid)
1115                 return -1;
1116         else if (dev1->devid > dev2->devid)
1117                 return 1;
1118         return 0;
1119 }
1120
1121 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1122                        fmode_t flags, void *holder)
1123 {
1124         int ret;
1125
1126         mutex_lock(&uuid_mutex);
1127         if (fs_devices->opened) {
1128                 fs_devices->opened++;
1129                 ret = 0;
1130         } else {
1131                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1132                 ret = __btrfs_open_devices(fs_devices, flags, holder);
1133         }
1134         mutex_unlock(&uuid_mutex);
1135         return ret;
1136 }
1137
/*
 * Undo a successful btrfs_read_disk_super(): unmap the page holding the
 * super block and drop the page reference.
 */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1143
1144 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1145                                  struct page **page,
1146                                  struct btrfs_super_block **disk_super)
1147 {
1148         void *p;
1149         pgoff_t index;
1150
1151         /* make sure our super fits in the device */
1152         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1153                 return 1;
1154
1155         /* make sure our super fits in the page */
1156         if (sizeof(**disk_super) > PAGE_SIZE)
1157                 return 1;
1158
1159         /* make sure our super doesn't straddle pages on disk */
1160         index = bytenr >> PAGE_SHIFT;
1161         if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1162                 return 1;
1163
1164         /* pull in the page with our super */
1165         *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1166                                    index, GFP_KERNEL);
1167
1168         if (IS_ERR_OR_NULL(*page))
1169                 return 1;
1170
1171         p = kmap(*page);
1172
1173         /* align our pointer to the offset of the super block */
1174         *disk_super = p + (bytenr & ~PAGE_MASK);
1175
1176         if (btrfs_super_bytenr(*disk_super) != bytenr ||
1177             btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1178                 btrfs_release_disk_super(*page);
1179                 return 1;
1180         }
1181
1182         if ((*disk_super)->label[0] &&
1183                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1184                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1185
1186         return 0;
1187 }
1188
1189 /*
1190  * Look for a btrfs signature on a device. This may be called out of the mount path
1191  * and we are not allowed to call set_blocksize during the scan. The superblock
1192  * is read via pagecache
1193  */
1194 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1195                           struct btrfs_fs_devices **fs_devices_ret)
1196 {
1197         struct btrfs_super_block *disk_super;
1198         struct btrfs_device *device;
1199         struct block_device *bdev;
1200         struct page *page;
1201         int ret = 0;
1202         u64 bytenr;
1203
1204         /*
1205          * we would like to check all the supers, but that would make
1206          * a btrfs mount succeed after a mkfs from a different FS.
1207          * So, we need to add a special mount option to scan for
1208          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1209          */
1210         bytenr = btrfs_sb_offset(0);
1211         flags |= FMODE_EXCL;
1212         mutex_lock(&uuid_mutex);
1213
1214         bdev = blkdev_get_by_path(path, flags, holder);
1215         if (IS_ERR(bdev)) {
1216                 ret = PTR_ERR(bdev);
1217                 goto error;
1218         }
1219
1220         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1221                 ret = -EINVAL;
1222                 goto error_bdev_put;
1223         }
1224
1225         device = device_list_add(path, disk_super);
1226         if (IS_ERR(device))
1227                 ret = PTR_ERR(device);
1228         else
1229                 *fs_devices_ret = device->fs_devices;
1230
1231         btrfs_release_disk_super(page);
1232
1233 error_bdev_put:
1234         blkdev_put(bdev, flags);
1235 error:
1236         mutex_unlock(&uuid_mutex);
1237         return ret;
1238 }
1239
1240 /* helper to account the used device space in the range */
1241 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1242                                    u64 end, u64 *length)
1243 {
1244         struct btrfs_key key;
1245         struct btrfs_root *root = device->fs_info->dev_root;
1246         struct btrfs_dev_extent *dev_extent;
1247         struct btrfs_path *path;
1248         u64 extent_end;
1249         int ret;
1250         int slot;
1251         struct extent_buffer *l;
1252
1253         *length = 0;
1254
1255         if (start >= device->total_bytes ||
1256                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
1257                 return 0;
1258
1259         path = btrfs_alloc_path();
1260         if (!path)
1261                 return -ENOMEM;
1262         path->reada = READA_FORWARD;
1263
1264         key.objectid = device->devid;
1265         key.offset = start;
1266         key.type = BTRFS_DEV_EXTENT_KEY;
1267
1268         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1269         if (ret < 0)
1270                 goto out;
1271         if (ret > 0) {
1272                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1273                 if (ret < 0)
1274                         goto out;
1275         }
1276
1277         while (1) {
1278                 l = path->nodes[0];
1279                 slot = path->slots[0];
1280                 if (slot >= btrfs_header_nritems(l)) {
1281                         ret = btrfs_next_leaf(root, path);
1282                         if (ret == 0)
1283                                 continue;
1284                         if (ret < 0)
1285                                 goto out;
1286
1287                         break;
1288                 }
1289                 btrfs_item_key_to_cpu(l, &key, slot);
1290
1291                 if (key.objectid < device->devid)
1292                         goto next;
1293
1294                 if (key.objectid > device->devid)
1295                         break;
1296
1297                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1298                         goto next;
1299
1300                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1301                 extent_end = key.offset + btrfs_dev_extent_length(l,
1302                                                                   dev_extent);
1303                 if (key.offset <= start && extent_end > end) {
1304                         *length = end - start + 1;
1305                         break;
1306                 } else if (key.offset <= start && extent_end > start)
1307                         *length += extent_end - start;
1308                 else if (key.offset > start && extent_end <= end)
1309                         *length += extent_end - key.offset;
1310                 else if (key.offset > start && key.offset <= end) {
1311                         *length += end - key.offset + 1;
1312                         break;
1313                 } else if (key.offset > end)
1314                         break;
1315
1316 next:
1317                 path->slots[0]++;
1318         }
1319         ret = 0;
1320 out:
1321         btrfs_free_path(path);
1322         return ret;
1323 }
1324
1325 static int contains_pending_extent(struct btrfs_transaction *transaction,
1326                                    struct btrfs_device *device,
1327                                    u64 *start, u64 len)
1328 {
1329         struct btrfs_fs_info *fs_info = device->fs_info;
1330         struct extent_map *em;
1331         struct list_head *search_list = &fs_info->pinned_chunks;
1332         int ret = 0;
1333         u64 physical_start = *start;
1334
1335         if (transaction)
1336                 search_list = &transaction->pending_chunks;
1337 again:
1338         list_for_each_entry(em, search_list, list) {
1339                 struct map_lookup *map;
1340                 int i;
1341
1342                 map = em->map_lookup;
1343                 for (i = 0; i < map->num_stripes; i++) {
1344                         u64 end;
1345
1346                         if (map->stripes[i].dev != device)
1347                                 continue;
1348                         if (map->stripes[i].physical >= physical_start + len ||
1349                             map->stripes[i].physical + em->orig_block_len <=
1350                             physical_start)
1351                                 continue;
1352                         /*
1353                          * Make sure that while processing the pinned list we do
1354                          * not override our *start with a lower value, because
1355                          * we can have pinned chunks that fall within this
1356                          * device hole and that have lower physical addresses
1357                          * than the pending chunks we processed before. If we
1358                          * do not take this special care we can end up getting
1359                          * 2 pending chunks that start at the same physical
1360                          * device offsets because the end offset of a pinned
1361                          * chunk can be equal to the start offset of some
1362                          * pending chunk.
1363                          */
1364                         end = map->stripes[i].physical + em->orig_block_len;
1365                         if (end > *start) {
1366                                 *start = end;
1367                                 ret = 1;
1368                         }
1369                 }
1370         }
1371         if (search_list != &fs_info->pinned_chunks) {
1372                 search_list = &fs_info->pinned_chunks;
1373                 goto again;
1374         }
1375
1376         return ret;
1377 }
1378
1379
1380 /*
1381  * find_free_dev_extent_start - find free space in the specified device
1382  * @device:       the device which we search the free space in
1383  * @num_bytes:    the size of the free space that we need
1384  * @search_start: the position from which to begin the search
1385  * @start:        store the start of the free space.
1386  * @len:          the size of the free space. that we find, or the size
1387  *                of the max free space if we don't find suitable free space
1388  *
1389  * this uses a pretty simple search, the expectation is that it is
1390  * called very infrequently and that a given device has a small number
1391  * of extents
1392  *
1393  * @start is used to store the start of the free space if we find. But if we
1394  * don't find suitable free space, it will be used to store the start position
1395  * of the max free space.
1396  *
1397  * @len is used to store the size of the free space that we find.
1398  * But if we don't find suitable free space, it is used to store the size of
1399  * the max free space.
1400  */
1401 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1402                                struct btrfs_device *device, u64 num_bytes,
1403                                u64 search_start, u64 *start, u64 *len)
1404 {
1405         struct btrfs_fs_info *fs_info = device->fs_info;
1406         struct btrfs_root *root = fs_info->dev_root;
1407         struct btrfs_key key;
1408         struct btrfs_dev_extent *dev_extent;
1409         struct btrfs_path *path;
1410         u64 hole_size;
1411         u64 max_hole_start;
1412         u64 max_hole_size;
1413         u64 extent_end;
1414         u64 search_end = device->total_bytes;
1415         int ret;
1416         int slot;
1417         struct extent_buffer *l;
1418
1419         /*
1420          * We don't want to overwrite the superblock on the drive nor any area
1421          * used by the boot loader (grub for example), so we make sure to start
1422          * at an offset of at least 1MB.
1423          */
1424         search_start = max_t(u64, search_start, SZ_1M);
1425
1426         path = btrfs_alloc_path();
1427         if (!path)
1428                 return -ENOMEM;
1429
1430         max_hole_start = search_start;
1431         max_hole_size = 0;
1432
1433 again:
1434         if (search_start >= search_end ||
1435                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1436                 ret = -ENOSPC;
1437                 goto out;
1438         }
1439
1440         path->reada = READA_FORWARD;
1441         path->search_commit_root = 1;
1442         path->skip_locking = 1;
1443
1444         key.objectid = device->devid;
1445         key.offset = search_start;
1446         key.type = BTRFS_DEV_EXTENT_KEY;
1447
1448         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1449         if (ret < 0)
1450                 goto out;
1451         if (ret > 0) {
1452                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1453                 if (ret < 0)
1454                         goto out;
1455         }
1456
1457         while (1) {
1458                 l = path->nodes[0];
1459                 slot = path->slots[0];
1460                 if (slot >= btrfs_header_nritems(l)) {
1461                         ret = btrfs_next_leaf(root, path);
1462                         if (ret == 0)
1463                                 continue;
1464                         if (ret < 0)
1465                                 goto out;
1466
1467                         break;
1468                 }
1469                 btrfs_item_key_to_cpu(l, &key, slot);
1470
1471                 if (key.objectid < device->devid)
1472                         goto next;
1473
1474                 if (key.objectid > device->devid)
1475                         break;
1476
1477                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1478                         goto next;
1479
1480                 if (key.offset > search_start) {
1481                         hole_size = key.offset - search_start;
1482
1483                         /*
1484                          * Have to check before we set max_hole_start, otherwise
1485                          * we could end up sending back this offset anyway.
1486                          */
1487                         if (contains_pending_extent(transaction, device,
1488                                                     &search_start,
1489                                                     hole_size)) {
1490                                 if (key.offset >= search_start) {
1491                                         hole_size = key.offset - search_start;
1492                                 } else {
1493                                         WARN_ON_ONCE(1);
1494                                         hole_size = 0;
1495                                 }
1496                         }
1497
1498                         if (hole_size > max_hole_size) {
1499                                 max_hole_start = search_start;
1500                                 max_hole_size = hole_size;
1501                         }
1502
1503                         /*
1504                          * If this free space is greater than which we need,
1505                          * it must be the max free space that we have found
1506                          * until now, so max_hole_start must point to the start
1507                          * of this free space and the length of this free space
1508                          * is stored in max_hole_size. Thus, we return
1509                          * max_hole_start and max_hole_size and go back to the
1510                          * caller.
1511                          */
1512                         if (hole_size >= num_bytes) {
1513                                 ret = 0;
1514                                 goto out;
1515                         }
1516                 }
1517
1518                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1519                 extent_end = key.offset + btrfs_dev_extent_length(l,
1520                                                                   dev_extent);
1521                 if (extent_end > search_start)
1522                         search_start = extent_end;
1523 next:
1524                 path->slots[0]++;
1525                 cond_resched();
1526         }
1527
1528         /*
1529          * At this point, search_start should be the end of
1530          * allocated dev extents, and when shrinking the device,
1531          * search_end may be smaller than search_start.
1532          */
1533         if (search_end > search_start) {
1534                 hole_size = search_end - search_start;
1535
1536                 if (contains_pending_extent(transaction, device, &search_start,
1537                                             hole_size)) {
1538                         btrfs_release_path(path);
1539                         goto again;
1540                 }
1541
1542                 if (hole_size > max_hole_size) {
1543                         max_hole_start = search_start;
1544                         max_hole_size = hole_size;
1545                 }
1546         }
1547
1548         /* See above. */
1549         if (max_hole_size < num_bytes)
1550                 ret = -ENOSPC;
1551         else
1552                 ret = 0;
1553
1554 out:
1555         btrfs_free_path(path);
1556         *start = max_hole_start;
1557         if (len)
1558                 *len = max_hole_size;
1559         return ret;
1560 }
1561
1562 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1563                          struct btrfs_device *device, u64 num_bytes,
1564                          u64 *start, u64 *len)
1565 {
1566         /* FIXME use last free of some kind */
1567         return find_free_dev_extent_start(trans->transaction, device,
1568                                           num_bytes, 0, start, len);
1569 }
1570
1571 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1572                           struct btrfs_device *device,
1573                           u64 start, u64 *dev_extent_len)
1574 {
1575         struct btrfs_fs_info *fs_info = device->fs_info;
1576         struct btrfs_root *root = fs_info->dev_root;
1577         int ret;
1578         struct btrfs_path *path;
1579         struct btrfs_key key;
1580         struct btrfs_key found_key;
1581         struct extent_buffer *leaf = NULL;
1582         struct btrfs_dev_extent *extent = NULL;
1583
1584         path = btrfs_alloc_path();
1585         if (!path)
1586                 return -ENOMEM;
1587
1588         key.objectid = device->devid;
1589         key.offset = start;
1590         key.type = BTRFS_DEV_EXTENT_KEY;
1591 again:
1592         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1593         if (ret > 0) {
1594                 ret = btrfs_previous_item(root, path, key.objectid,
1595                                           BTRFS_DEV_EXTENT_KEY);
1596                 if (ret)
1597                         goto out;
1598                 leaf = path->nodes[0];
1599                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1600                 extent = btrfs_item_ptr(leaf, path->slots[0],
1601                                         struct btrfs_dev_extent);
1602                 BUG_ON(found_key.offset > start || found_key.offset +
1603                        btrfs_dev_extent_length(leaf, extent) < start);
1604                 key = found_key;
1605                 btrfs_release_path(path);
1606                 goto again;
1607         } else if (ret == 0) {
1608                 leaf = path->nodes[0];
1609                 extent = btrfs_item_ptr(leaf, path->slots[0],
1610                                         struct btrfs_dev_extent);
1611         } else {
1612                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1613                 goto out;
1614         }
1615
1616         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1617
1618         ret = btrfs_del_item(trans, root, path);
1619         if (ret) {
1620                 btrfs_handle_fs_error(fs_info, ret,
1621                                       "Failed to remove dev extent item");
1622         } else {
1623                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1624         }
1625 out:
1626         btrfs_free_path(path);
1627         return ret;
1628 }
1629
1630 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1631                                   struct btrfs_device *device,
1632                                   u64 chunk_offset, u64 start, u64 num_bytes)
1633 {
1634         int ret;
1635         struct btrfs_path *path;
1636         struct btrfs_fs_info *fs_info = device->fs_info;
1637         struct btrfs_root *root = fs_info->dev_root;
1638         struct btrfs_dev_extent *extent;
1639         struct extent_buffer *leaf;
1640         struct btrfs_key key;
1641
1642         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1643         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1644         path = btrfs_alloc_path();
1645         if (!path)
1646                 return -ENOMEM;
1647
1648         key.objectid = device->devid;
1649         key.offset = start;
1650         key.type = BTRFS_DEV_EXTENT_KEY;
1651         ret = btrfs_insert_empty_item(trans, root, path, &key,
1652                                       sizeof(*extent));
1653         if (ret)
1654                 goto out;
1655
1656         leaf = path->nodes[0];
1657         extent = btrfs_item_ptr(leaf, path->slots[0],
1658                                 struct btrfs_dev_extent);
1659         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1660                                         BTRFS_CHUNK_TREE_OBJECTID);
1661         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1662                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1663         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1664
1665         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1666         btrfs_mark_buffer_dirty(leaf);
1667 out:
1668         btrfs_free_path(path);
1669         return ret;
1670 }
1671
1672 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1673 {
1674         struct extent_map_tree *em_tree;
1675         struct extent_map *em;
1676         struct rb_node *n;
1677         u64 ret = 0;
1678
1679         em_tree = &fs_info->mapping_tree.map_tree;
1680         read_lock(&em_tree->lock);
1681         n = rb_last(&em_tree->map);
1682         if (n) {
1683                 em = rb_entry(n, struct extent_map, rb_node);
1684                 ret = em->start + em->len;
1685         }
1686         read_unlock(&em_tree->lock);
1687
1688         return ret;
1689 }
1690
1691 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1692                                     u64 *devid_ret)
1693 {
1694         int ret;
1695         struct btrfs_key key;
1696         struct btrfs_key found_key;
1697         struct btrfs_path *path;
1698
1699         path = btrfs_alloc_path();
1700         if (!path)
1701                 return -ENOMEM;
1702
1703         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1704         key.type = BTRFS_DEV_ITEM_KEY;
1705         key.offset = (u64)-1;
1706
1707         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1708         if (ret < 0)
1709                 goto error;
1710
1711         BUG_ON(ret == 0); /* Corruption */
1712
1713         ret = btrfs_previous_item(fs_info->chunk_root, path,
1714                                   BTRFS_DEV_ITEMS_OBJECTID,
1715                                   BTRFS_DEV_ITEM_KEY);
1716         if (ret) {
1717                 *devid_ret = 1;
1718         } else {
1719                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1720                                       path->slots[0]);
1721                 *devid_ret = found_key.offset + 1;
1722         }
1723         ret = 0;
1724 error:
1725         btrfs_free_path(path);
1726         return ret;
1727 }
1728
1729 /*
1730  * the device information is stored in the chunk root
1731  * the btrfs_device struct should be fully filled in
1732  */
1733 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1734                             struct btrfs_fs_info *fs_info,
1735                             struct btrfs_device *device)
1736 {
1737         struct btrfs_root *root = fs_info->chunk_root;
1738         int ret;
1739         struct btrfs_path *path;
1740         struct btrfs_dev_item *dev_item;
1741         struct extent_buffer *leaf;
1742         struct btrfs_key key;
1743         unsigned long ptr;
1744
1745         path = btrfs_alloc_path();
1746         if (!path)
1747                 return -ENOMEM;
1748
1749         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1750         key.type = BTRFS_DEV_ITEM_KEY;
1751         key.offset = device->devid;
1752
1753         ret = btrfs_insert_empty_item(trans, root, path, &key,
1754                                       sizeof(*dev_item));
1755         if (ret)
1756                 goto out;
1757
1758         leaf = path->nodes[0];
1759         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1760
1761         btrfs_set_device_id(leaf, dev_item, device->devid);
1762         btrfs_set_device_generation(leaf, dev_item, 0);
1763         btrfs_set_device_type(leaf, dev_item, device->type);
1764         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1765         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1766         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1767         btrfs_set_device_total_bytes(leaf, dev_item,
1768                                      btrfs_device_get_disk_total_bytes(device));
1769         btrfs_set_device_bytes_used(leaf, dev_item,
1770                                     btrfs_device_get_bytes_used(device));
1771         btrfs_set_device_group(leaf, dev_item, 0);
1772         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1773         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1774         btrfs_set_device_start_offset(leaf, dev_item, 0);
1775
1776         ptr = btrfs_device_uuid(dev_item);
1777         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1778         ptr = btrfs_device_fsid(dev_item);
1779         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1780         btrfs_mark_buffer_dirty(leaf);
1781
1782         ret = 0;
1783 out:
1784         btrfs_free_path(path);
1785         return ret;
1786 }
1787
1788 /*
1789  * Function to update ctime/mtime for a given device path.
1790  * Mainly used for ctime/mtime based probe like libblkid.
1791  */
1792 static void update_dev_time(const char *path_name)
1793 {
1794         struct file *filp;
1795
1796         filp = filp_open(path_name, O_RDWR, 0);
1797         if (IS_ERR(filp))
1798                 return;
1799         file_update_time(filp);
1800         filp_close(filp, NULL);
1801 }
1802
1803 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1804                              struct btrfs_device *device)
1805 {
1806         struct btrfs_root *root = fs_info->chunk_root;
1807         int ret;
1808         struct btrfs_path *path;
1809         struct btrfs_key key;
1810         struct btrfs_trans_handle *trans;
1811
1812         path = btrfs_alloc_path();
1813         if (!path)
1814                 return -ENOMEM;
1815
1816         trans = btrfs_start_transaction(root, 0);
1817         if (IS_ERR(trans)) {
1818                 btrfs_free_path(path);
1819                 return PTR_ERR(trans);
1820         }
1821         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1822         key.type = BTRFS_DEV_ITEM_KEY;
1823         key.offset = device->devid;
1824
1825         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1826         if (ret) {
1827                 if (ret > 0)
1828                         ret = -ENOENT;
1829                 btrfs_abort_transaction(trans, ret);
1830                 btrfs_end_transaction(trans);
1831                 goto out;
1832         }
1833
1834         ret = btrfs_del_item(trans, root, path);
1835         if (ret) {
1836                 btrfs_abort_transaction(trans, ret);
1837                 btrfs_end_transaction(trans);
1838         }
1839
1840 out:
1841         btrfs_free_path(path);
1842         if (!ret)
1843                 ret = btrfs_commit_transaction(trans);
1844         return ret;
1845 }
1846
1847 /*
1848  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1849  * filesystem. It's up to the caller to adjust that number regarding eg. device
1850  * replace.
1851  */
1852 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1853                 u64 num_devices)
1854 {
1855         u64 all_avail;
1856         unsigned seq;
1857         int i;
1858
1859         do {
1860                 seq = read_seqbegin(&fs_info->profiles_lock);
1861
1862                 all_avail = fs_info->avail_data_alloc_bits |
1863                             fs_info->avail_system_alloc_bits |
1864                             fs_info->avail_metadata_alloc_bits;
1865         } while (read_seqretry(&fs_info->profiles_lock, seq));
1866
1867         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1868                 if (!(all_avail & btrfs_raid_group[i]))
1869                         continue;
1870
1871                 if (num_devices < btrfs_raid_array[i].devs_min) {
1872                         int ret = btrfs_raid_mindev_error[i];
1873
1874                         if (ret)
1875                                 return ret;
1876                 }
1877         }
1878
1879         return 0;
1880 }
1881
1882 static struct btrfs_device * btrfs_find_next_active_device(
1883                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1884 {
1885         struct btrfs_device *next_device;
1886
1887         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1888                 if (next_device != device &&
1889                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1890                     && next_device->bdev)
1891                         return next_device;
1892         }
1893
1894         return NULL;
1895 }
1896
1897 /*
1898  * Helper function to check if the given device is part of s_bdev / latest_bdev
1899  * and replace it with the provided or the next active device, in the context
1900  * where this function called, there should be always be another device (or
1901  * this_dev) which is active.
1902  */
1903 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1904                 struct btrfs_device *device, struct btrfs_device *this_dev)
1905 {
1906         struct btrfs_device *next_device;
1907
1908         if (this_dev)
1909                 next_device = this_dev;
1910         else
1911                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1912                                                                 device);
1913         ASSERT(next_device);
1914
1915         if (fs_info->sb->s_bdev &&
1916                         (fs_info->sb->s_bdev == device->bdev))
1917                 fs_info->sb->s_bdev = next_device->bdev;
1918
1919         if (fs_info->fs_devices->latest_bdev == device->bdev)
1920                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1921 }
1922
1923 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1924                 u64 devid)
1925 {
1926         struct btrfs_device *device;
1927         struct btrfs_fs_devices *cur_devices;
1928         u64 num_devices;
1929         int ret = 0;
1930
1931         mutex_lock(&fs_info->volume_mutex);
1932         mutex_lock(&uuid_mutex);
1933
1934         num_devices = fs_info->fs_devices->num_devices;
1935         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
1936         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1937                 WARN_ON(num_devices < 1);
1938                 num_devices--;
1939         }
1940         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
1941
1942         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1943         if (ret)
1944                 goto out;
1945
1946         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1947                                            &device);
1948         if (ret)
1949                 goto out;
1950
1951         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1952                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1953                 goto out;
1954         }
1955
1956         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1957             fs_info->fs_devices->rw_devices == 1) {
1958                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1959                 goto out;
1960         }
1961
1962         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1963                 mutex_lock(&fs_info->chunk_mutex);
1964                 list_del_init(&device->dev_alloc_list);
1965                 device->fs_devices->rw_devices--;
1966                 mutex_unlock(&fs_info->chunk_mutex);
1967         }
1968
1969         mutex_unlock(&uuid_mutex);
1970         ret = btrfs_shrink_device(device, 0);
1971         mutex_lock(&uuid_mutex);
1972         if (ret)
1973                 goto error_undo;
1974
1975         /*
1976          * TODO: the superblock still includes this device in its num_devices
1977          * counter although write_all_supers() is not locked out. This
1978          * could give a filesystem state which requires a degraded mount.
1979          */
1980         ret = btrfs_rm_dev_item(fs_info, device);
1981         if (ret)
1982                 goto error_undo;
1983
1984         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1985         btrfs_scrub_cancel_dev(fs_info, device);
1986
1987         /*
1988          * the device list mutex makes sure that we don't change
1989          * the device list while someone else is writing out all
1990          * the device supers. Whoever is writing all supers, should
1991          * lock the device list mutex before getting the number of
1992          * devices in the super block (super_copy). Conversely,
1993          * whoever updates the number of devices in the super block
1994          * (super_copy) should hold the device list mutex.
1995          */
1996
1997         cur_devices = device->fs_devices;
1998         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1999         list_del_rcu(&device->dev_list);
2000
2001         device->fs_devices->num_devices--;
2002         device->fs_devices->total_devices--;
2003
2004         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2005                 device->fs_devices->missing_devices--;
2006
2007         btrfs_assign_next_active_device(fs_info, device, NULL);
2008
2009         if (device->bdev) {
2010                 device->fs_devices->open_devices--;
2011                 /* remove sysfs entry */
2012                 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2013         }
2014
2015         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2016         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2017         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2018
2019         /*
2020          * at this point, the device is zero sized and detached from
2021          * the devices list.  All that's left is to zero out the old
2022          * supers and free the device.
2023          */
2024         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2025                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2026
2027         btrfs_close_bdev(device);
2028         call_rcu(&device->rcu, free_device_rcu);
2029
2030         if (cur_devices->open_devices == 0) {
2031                 struct btrfs_fs_devices *fs_devices;
2032                 fs_devices = fs_info->fs_devices;
2033                 while (fs_devices) {
2034                         if (fs_devices->seed == cur_devices) {
2035                                 fs_devices->seed = cur_devices->seed;
2036                                 break;
2037                         }
2038                         fs_devices = fs_devices->seed;
2039                 }
2040                 cur_devices->seed = NULL;
2041                 __btrfs_close_devices(cur_devices);
2042                 free_fs_devices(cur_devices);
2043         }
2044
2045 out:
2046         mutex_unlock(&uuid_mutex);
2047         mutex_unlock(&fs_info->volume_mutex);
2048         return ret;
2049
2050 error_undo:
2051         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2052                 mutex_lock(&fs_info->chunk_mutex);
2053                 list_add(&device->dev_alloc_list,
2054                          &fs_info->fs_devices->alloc_list);
2055                 device->fs_devices->rw_devices++;
2056                 mutex_unlock(&fs_info->chunk_mutex);
2057         }
2058         goto out;
2059 }
2060
2061 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2062                                         struct btrfs_device *srcdev)
2063 {
2064         struct btrfs_fs_devices *fs_devices;
2065
2066         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
2067
2068         /*
2069          * in case of fs with no seed, srcdev->fs_devices will point
2070          * to fs_devices of fs_info. However when the dev being replaced is
2071          * a seed dev it will point to the seed's local fs_devices. In short
2072          * srcdev will have its correct fs_devices in both the cases.
2073          */
2074         fs_devices = srcdev->fs_devices;
2075
2076         list_del_rcu(&srcdev->dev_list);
2077         list_del(&srcdev->dev_alloc_list);
2078         fs_devices->num_devices--;
2079         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2080                 fs_devices->missing_devices--;
2081
2082         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2083                 fs_devices->rw_devices--;
2084
2085         if (srcdev->bdev)
2086                 fs_devices->open_devices--;
2087 }
2088
2089 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2090                                       struct btrfs_device *srcdev)
2091 {
2092         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2093
2094         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2095                 /* zero out the old super if it is writable */
2096                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2097         }
2098
2099         btrfs_close_bdev(srcdev);
2100         call_rcu(&srcdev->rcu, free_device_rcu);
2101
2102         /* if this is no devs we rather delete the fs_devices */
2103         if (!fs_devices->num_devices) {
2104                 struct btrfs_fs_devices *tmp_fs_devices;
2105
2106                 /*
2107                  * On a mounted FS, num_devices can't be zero unless it's a
2108                  * seed. In case of a seed device being replaced, the replace
2109                  * target added to the sprout FS, so there will be no more
2110                  * device left under the seed FS.
2111                  */
2112                 ASSERT(fs_devices->seeding);
2113
2114                 tmp_fs_devices = fs_info->fs_devices;
2115                 while (tmp_fs_devices) {
2116                         if (tmp_fs_devices->seed == fs_devices) {
2117                                 tmp_fs_devices->seed = fs_devices->seed;
2118                                 break;
2119                         }
2120                         tmp_fs_devices = tmp_fs_devices->seed;
2121                 }
2122                 fs_devices->seed = NULL;
2123                 __btrfs_close_devices(fs_devices);
2124                 free_fs_devices(fs_devices);
2125         }
2126 }
2127
2128 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2129                                       struct btrfs_device *tgtdev)
2130 {
2131         mutex_lock(&uuid_mutex);
2132         WARN_ON(!tgtdev);
2133         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2134
2135         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2136
2137         if (tgtdev->bdev)
2138                 fs_info->fs_devices->open_devices--;
2139
2140         fs_info->fs_devices->num_devices--;
2141
2142         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2143
2144         list_del_rcu(&tgtdev->dev_list);
2145
2146         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2147         mutex_unlock(&uuid_mutex);
2148
2149         /*
2150          * The update_dev_time() with in btrfs_scratch_superblocks()
2151          * may lead to a call to btrfs_show_devname() which will try
2152          * to hold device_list_mutex. And here this device
2153          * is already out of device list, so we don't have to hold
2154          * the device_list_mutex lock.
2155          */
2156         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2157
2158         btrfs_close_bdev(tgtdev);
2159         call_rcu(&tgtdev->rcu, free_device_rcu);
2160 }
2161
2162 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2163                                      const char *device_path,
2164                                      struct btrfs_device **device)
2165 {
2166         int ret = 0;
2167         struct btrfs_super_block *disk_super;
2168         u64 devid;
2169         u8 *dev_uuid;
2170         struct block_device *bdev;
2171         struct buffer_head *bh;
2172
2173         *device = NULL;
2174         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2175                                     fs_info->bdev_holder, 0, &bdev, &bh);
2176         if (ret)
2177                 return ret;
2178         disk_super = (struct btrfs_super_block *)bh->b_data;
2179         devid = btrfs_stack_device_id(&disk_super->dev_item);
2180         dev_uuid = disk_super->dev_item.uuid;
2181         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2182         brelse(bh);
2183         if (!*device)
2184                 ret = -ENOENT;
2185         blkdev_put(bdev, FMODE_READ);
2186         return ret;
2187 }
2188
2189 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2190                                          const char *device_path,
2191                                          struct btrfs_device **device)
2192 {
2193         *device = NULL;
2194         if (strcmp(device_path, "missing") == 0) {
2195                 struct list_head *devices;
2196                 struct btrfs_device *tmp;
2197
2198                 devices = &fs_info->fs_devices->devices;
2199                 /*
2200                  * It is safe to read the devices since the volume_mutex
2201                  * is held by the caller.
2202                  */
2203                 list_for_each_entry(tmp, devices, dev_list) {
2204                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2205                                         &tmp->dev_state) && !tmp->bdev) {
2206                                 *device = tmp;
2207                                 break;
2208                         }
2209                 }
2210
2211                 if (!*device)
2212                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2213
2214                 return 0;
2215         } else {
2216                 return btrfs_find_device_by_path(fs_info, device_path, device);
2217         }
2218 }
2219
2220 /*
2221  * Lookup a device given by device id, or the path if the id is 0.
2222  */
2223 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2224                                  const char *devpath,
2225                                  struct btrfs_device **device)
2226 {
2227         int ret;
2228
2229         if (devid) {
2230                 ret = 0;
2231                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2232                 if (!*device)
2233                         ret = -ENOENT;
2234         } else {
2235                 if (!devpath || !devpath[0])
2236                         return -EINVAL;
2237
2238                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2239                                                            device);
2240         }
2241         return ret;
2242 }
2243
2244 /*
2245  * does all the dirty work required for changing file system's UUID.
2246  */
2247 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2248 {
2249         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2250         struct btrfs_fs_devices *old_devices;
2251         struct btrfs_fs_devices *seed_devices;
2252         struct btrfs_super_block *disk_super = fs_info->super_copy;
2253         struct btrfs_device *device;
2254         u64 super_flags;
2255
2256         BUG_ON(!mutex_is_locked(&uuid_mutex));
2257         if (!fs_devices->seeding)
2258                 return -EINVAL;
2259
2260         seed_devices = alloc_fs_devices(NULL);
2261         if (IS_ERR(seed_devices))
2262                 return PTR_ERR(seed_devices);
2263
2264         old_devices = clone_fs_devices(fs_devices);
2265         if (IS_ERR(old_devices)) {
2266                 kfree(seed_devices);
2267                 return PTR_ERR(old_devices);
2268         }
2269
2270         list_add(&old_devices->list, &fs_uuids);
2271
2272         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2273         seed_devices->opened = 1;
2274         INIT_LIST_HEAD(&seed_devices->devices);
2275         INIT_LIST_HEAD(&seed_devices->alloc_list);
2276         mutex_init(&seed_devices->device_list_mutex);
2277
2278         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2279         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2280                               synchronize_rcu);
2281         list_for_each_entry(device, &seed_devices->devices, dev_list)
2282                 device->fs_devices = seed_devices;
2283
2284         mutex_lock(&fs_info->chunk_mutex);
2285         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2286         mutex_unlock(&fs_info->chunk_mutex);
2287
2288         fs_devices->seeding = 0;
2289         fs_devices->num_devices = 0;
2290         fs_devices->open_devices = 0;
2291         fs_devices->missing_devices = 0;
2292         fs_devices->rotating = 0;
2293         fs_devices->seed = seed_devices;
2294
2295         generate_random_uuid(fs_devices->fsid);
2296         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2297         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2298         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2299
2300         super_flags = btrfs_super_flags(disk_super) &
2301                       ~BTRFS_SUPER_FLAG_SEEDING;
2302         btrfs_set_super_flags(disk_super, super_flags);
2303
2304         return 0;
2305 }
2306
2307 /*
2308  * Store the expected generation for seed devices in device items.
2309  */
2310 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2311                                struct btrfs_fs_info *fs_info)
2312 {
2313         struct btrfs_root *root = fs_info->chunk_root;
2314         struct btrfs_path *path;
2315         struct extent_buffer *leaf;
2316         struct btrfs_dev_item *dev_item;
2317         struct btrfs_device *device;
2318         struct btrfs_key key;
2319         u8 fs_uuid[BTRFS_FSID_SIZE];
2320         u8 dev_uuid[BTRFS_UUID_SIZE];
2321         u64 devid;
2322         int ret;
2323
2324         path = btrfs_alloc_path();
2325         if (!path)
2326                 return -ENOMEM;
2327
2328         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2329         key.offset = 0;
2330         key.type = BTRFS_DEV_ITEM_KEY;
2331
2332         while (1) {
2333                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2334                 if (ret < 0)
2335                         goto error;
2336
2337                 leaf = path->nodes[0];
2338 next_slot:
2339                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2340                         ret = btrfs_next_leaf(root, path);
2341                         if (ret > 0)
2342                                 break;
2343                         if (ret < 0)
2344                                 goto error;
2345                         leaf = path->nodes[0];
2346                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2347                         btrfs_release_path(path);
2348                         continue;
2349                 }
2350
2351                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2352                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2353                     key.type != BTRFS_DEV_ITEM_KEY)
2354                         break;
2355
2356                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2357                                           struct btrfs_dev_item);
2358                 devid = btrfs_device_id(leaf, dev_item);
2359                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2360                                    BTRFS_UUID_SIZE);
2361                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2362                                    BTRFS_FSID_SIZE);
2363                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2364                 BUG_ON(!device); /* Logic error */
2365
2366                 if (device->fs_devices->seeding) {
2367                         btrfs_set_device_generation(leaf, dev_item,
2368                                                     device->generation);
2369                         btrfs_mark_buffer_dirty(leaf);
2370                 }
2371
2372                 path->slots[0]++;
2373                 goto next_slot;
2374         }
2375         ret = 0;
2376 error:
2377         btrfs_free_path(path);
2378         return ret;
2379 }
2380
2381 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2382 {
2383         struct btrfs_root *root = fs_info->dev_root;
2384         struct request_queue *q;
2385         struct btrfs_trans_handle *trans;
2386         struct btrfs_device *device;
2387         struct block_device *bdev;
2388         struct list_head *devices;
2389         struct super_block *sb = fs_info->sb;
2390         struct rcu_string *name;
2391         u64 tmp;
2392         int seeding_dev = 0;
2393         int ret = 0;
2394         bool unlocked = false;
2395
2396         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2397                 return -EROFS;
2398
2399         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2400                                   fs_info->bdev_holder);
2401         if (IS_ERR(bdev))
2402                 return PTR_ERR(bdev);
2403
2404         if (fs_info->fs_devices->seeding) {
2405                 seeding_dev = 1;
2406                 down_write(&sb->s_umount);
2407                 mutex_lock(&uuid_mutex);
2408         }
2409
2410         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2411
2412         devices = &fs_info->fs_devices->devices;
2413
2414         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2415         list_for_each_entry(device, devices, dev_list) {
2416                 if (device->bdev == bdev) {
2417                         ret = -EEXIST;
2418                         mutex_unlock(
2419                                 &fs_info->fs_devices->device_list_mutex);
2420                         goto error;
2421                 }
2422         }
2423         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2424
2425         device = btrfs_alloc_device(fs_info, NULL, NULL);
2426         if (IS_ERR(device)) {
2427                 /* we can safely leave the fs_devices entry around */
2428                 ret = PTR_ERR(device);
2429                 goto error;
2430         }
2431
2432         name = rcu_string_strdup(device_path, GFP_KERNEL);
2433         if (!name) {
2434                 ret = -ENOMEM;
2435                 goto error_free_device;
2436         }
2437         rcu_assign_pointer(device->name, name);
2438
2439         trans = btrfs_start_transaction(root, 0);
2440         if (IS_ERR(trans)) {
2441                 ret = PTR_ERR(trans);
2442                 goto error_free_device;
2443         }
2444
2445         q = bdev_get_queue(bdev);
2446         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2447         device->generation = trans->transid;
2448         device->io_width = fs_info->sectorsize;
2449         device->io_align = fs_info->sectorsize;
2450         device->sector_size = fs_info->sectorsize;
2451         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2452                                          fs_info->sectorsize);
2453         device->disk_total_bytes = device->total_bytes;
2454         device->commit_total_bytes = device->total_bytes;
2455         device->fs_info = fs_info;
2456         device->bdev = bdev;
2457         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2458         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2459         device->mode = FMODE_EXCL;
2460         device->dev_stats_valid = 1;
2461         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2462
2463         if (seeding_dev) {
2464                 sb->s_flags &= ~SB_RDONLY;
2465                 ret = btrfs_prepare_sprout(fs_info);
2466                 if (ret) {
2467                         btrfs_abort_transaction(trans, ret);
2468                         goto error_trans;
2469                 }
2470         }
2471
2472         device->fs_devices = fs_info->fs_devices;
2473
2474         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2475         mutex_lock(&fs_info->chunk_mutex);
2476         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2477         list_add(&device->dev_alloc_list,
2478                  &fs_info->fs_devices->alloc_list);
2479         fs_info->fs_devices->num_devices++;
2480         fs_info->fs_devices->open_devices++;
2481         fs_info->fs_devices->rw_devices++;
2482         fs_info->fs_devices->total_devices++;
2483         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2484
2485         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2486
2487         if (!blk_queue_nonrot(q))
2488                 fs_info->fs_devices->rotating = 1;
2489
2490         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2491         btrfs_set_super_total_bytes(fs_info->super_copy,
2492                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2493
2494         tmp = btrfs_super_num_devices(fs_info->super_copy);
2495         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2496
2497         /* add sysfs device entry */
2498         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2499
2500         /*
2501          * we've got more storage, clear any full flags on the space
2502          * infos
2503          */
2504         btrfs_clear_space_info_full(fs_info);
2505
2506         mutex_unlock(&fs_info->chunk_mutex);
2507         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2508
2509         if (seeding_dev) {
2510                 mutex_lock(&fs_info->chunk_mutex);
2511                 ret = init_first_rw_device(trans, fs_info);
2512                 mutex_unlock(&fs_info->chunk_mutex);
2513                 if (ret) {
2514                         btrfs_abort_transaction(trans, ret);
2515                         goto error_sysfs;
2516                 }
2517         }
2518
2519         ret = btrfs_add_dev_item(trans, fs_info, device);
2520         if (ret) {
2521                 btrfs_abort_transaction(trans, ret);
2522                 goto error_sysfs;
2523         }
2524
2525         if (seeding_dev) {
2526                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2527
2528                 ret = btrfs_finish_sprout(trans, fs_info);
2529                 if (ret) {
2530                         btrfs_abort_transaction(trans, ret);
2531                         goto error_sysfs;
2532                 }
2533
2534                 /* Sprouting would change fsid of the mounted root,
2535                  * so rename the fsid on the sysfs
2536                  */
2537                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2538                                                 fs_info->fsid);
2539                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2540                         btrfs_warn(fs_info,
2541                                    "sysfs: failed to create fsid for sprout");
2542         }
2543
2544         ret = btrfs_commit_transaction(trans);
2545
2546         if (seeding_dev) {
2547                 mutex_unlock(&uuid_mutex);
2548                 up_write(&sb->s_umount);
2549                 unlocked = true;
2550
2551                 if (ret) /* transaction commit */
2552                         return ret;
2553
2554                 ret = btrfs_relocate_sys_chunks(fs_info);
2555                 if (ret < 0)
2556                         btrfs_handle_fs_error(fs_info, ret,
2557                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2558                 trans = btrfs_attach_transaction(root);
2559                 if (IS_ERR(trans)) {
2560                         if (PTR_ERR(trans) == -ENOENT)
2561                                 return 0;
2562                         ret = PTR_ERR(trans);
2563                         trans = NULL;
2564                         goto error_sysfs;
2565                 }
2566                 ret = btrfs_commit_transaction(trans);
2567         }
2568
2569         /* Update ctime/mtime for libblkid */
2570         update_dev_time(device_path);
2571         return ret;
2572
2573 error_sysfs:
2574         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2575 error_trans:
2576         if (seeding_dev)
2577                 sb->s_flags |= SB_RDONLY;
2578         if (trans)
2579                 btrfs_end_transaction(trans);
2580 error_free_device:
2581         free_device(device);
2582 error:
2583         blkdev_put(bdev, FMODE_EXCL);
2584         if (seeding_dev && !unlocked) {
2585                 mutex_unlock(&uuid_mutex);
2586                 up_write(&sb->s_umount);
2587         }
2588         return ret;
2589 }
2590
2591 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2592                                   const char *device_path,
2593                                   struct btrfs_device *srcdev,
2594                                   struct btrfs_device **device_out)
2595 {
2596         struct btrfs_device *device;
2597         struct block_device *bdev;
2598         struct list_head *devices;
2599         struct rcu_string *name;
2600         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2601         int ret = 0;
2602
2603         *device_out = NULL;
2604         if (fs_info->fs_devices->seeding) {
2605                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2606                 return -EINVAL;
2607         }
2608
2609         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2610                                   fs_info->bdev_holder);
2611         if (IS_ERR(bdev)) {
2612                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2613                 return PTR_ERR(bdev);
2614         }
2615
2616         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2617
2618         devices = &fs_info->fs_devices->devices;
2619         list_for_each_entry(device, devices, dev_list) {
2620                 if (device->bdev == bdev) {
2621                         btrfs_err(fs_info,
2622                                   "target device is in the filesystem!");
2623                         ret = -EEXIST;
2624                         goto error;
2625                 }
2626         }
2627
2628
2629         if (i_size_read(bdev->bd_inode) <
2630             btrfs_device_get_total_bytes(srcdev)) {
2631                 btrfs_err(fs_info,
2632                           "target device is smaller than source device!");
2633                 ret = -EINVAL;
2634                 goto error;
2635         }
2636
2637
2638         device = btrfs_alloc_device(NULL, &devid, NULL);
2639         if (IS_ERR(device)) {
2640                 ret = PTR_ERR(device);
2641                 goto error;
2642         }
2643
2644         name = rcu_string_strdup(device_path, GFP_KERNEL);
2645         if (!name) {
2646                 free_device(device);
2647                 ret = -ENOMEM;
2648                 goto error;
2649         }
2650         rcu_assign_pointer(device->name, name);
2651
2652         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2653         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2654         device->generation = 0;
2655         device->io_width = fs_info->sectorsize;
2656         device->io_align = fs_info->sectorsize;
2657         device->sector_size = fs_info->sectorsize;
2658         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2659         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2660         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2661         ASSERT(list_empty(&srcdev->resized_list));
2662         device->commit_total_bytes = srcdev->commit_total_bytes;
2663         device->commit_bytes_used = device->bytes_used;
2664         device->fs_info = fs_info;
2665         device->bdev = bdev;
2666         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2667         set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2668         device->mode = FMODE_EXCL;
2669         device->dev_stats_valid = 1;
2670         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2671         device->fs_devices = fs_info->fs_devices;
2672         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2673         fs_info->fs_devices->num_devices++;
2674         fs_info->fs_devices->open_devices++;
2675         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2676
2677         *device_out = device;
2678         return ret;
2679
2680 error:
2681         blkdev_put(bdev, FMODE_EXCL);
2682         return ret;
2683 }
2684
2685 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2686                                         struct btrfs_device *device)
2687 {
2688         int ret;
2689         struct btrfs_path *path;
2690         struct btrfs_root *root = device->fs_info->chunk_root;
2691         struct btrfs_dev_item *dev_item;
2692         struct extent_buffer *leaf;
2693         struct btrfs_key key;
2694
2695         path = btrfs_alloc_path();
2696         if (!path)
2697                 return -ENOMEM;
2698
2699         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2700         key.type = BTRFS_DEV_ITEM_KEY;
2701         key.offset = device->devid;
2702
2703         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2704         if (ret < 0)
2705                 goto out;
2706
2707         if (ret > 0) {
2708                 ret = -ENOENT;
2709                 goto out;
2710         }
2711
2712         leaf = path->nodes[0];
2713         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2714
2715         btrfs_set_device_id(leaf, dev_item, device->devid);
2716         btrfs_set_device_type(leaf, dev_item, device->type);
2717         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2718         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2719         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2720         btrfs_set_device_total_bytes(leaf, dev_item,
2721                                      btrfs_device_get_disk_total_bytes(device));
2722         btrfs_set_device_bytes_used(leaf, dev_item,
2723                                     btrfs_device_get_bytes_used(device));
2724         btrfs_mark_buffer_dirty(leaf);
2725
2726 out:
2727         btrfs_free_path(path);
2728         return ret;
2729 }
2730
2731 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2732                       struct btrfs_device *device, u64 new_size)
2733 {
2734         struct btrfs_fs_info *fs_info = device->fs_info;
2735         struct btrfs_super_block *super_copy = fs_info->super_copy;
2736         struct btrfs_fs_devices *fs_devices;
2737         u64 old_total;
2738         u64 diff;
2739
2740         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2741                 return -EACCES;
2742
2743         new_size = round_down(new_size, fs_info->sectorsize);
2744
2745         mutex_lock(&fs_info->chunk_mutex);
2746         old_total = btrfs_super_total_bytes(super_copy);
2747         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2748
2749         if (new_size <= device->total_bytes ||
2750             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2751                 mutex_unlock(&fs_info->chunk_mutex);
2752                 return -EINVAL;
2753         }
2754
2755         fs_devices = fs_info->fs_devices;
2756
2757         btrfs_set_super_total_bytes(super_copy,
2758                         round_down(old_total + diff, fs_info->sectorsize));
2759         device->fs_devices->total_rw_bytes += diff;
2760
2761         btrfs_device_set_total_bytes(device, new_size);
2762         btrfs_device_set_disk_total_bytes(device, new_size);
2763         btrfs_clear_space_info_full(device->fs_info);
2764         if (list_empty(&device->resized_list))
2765                 list_add_tail(&device->resized_list,
2766                               &fs_devices->resized_devices);
2767         mutex_unlock(&fs_info->chunk_mutex);
2768
2769         return btrfs_update_device(trans, device);
2770 }
2771
2772 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2773                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2774 {
2775         struct btrfs_root *root = fs_info->chunk_root;
2776         int ret;
2777         struct btrfs_path *path;
2778         struct btrfs_key key;
2779
2780         path = btrfs_alloc_path();
2781         if (!path)
2782                 return -ENOMEM;
2783
2784         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2785         key.offset = chunk_offset;
2786         key.type = BTRFS_CHUNK_ITEM_KEY;
2787
2788         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2789         if (ret < 0)
2790                 goto out;
2791         else if (ret > 0) { /* Logic error or corruption */
2792                 btrfs_handle_fs_error(fs_info, -ENOENT,
2793                                       "Failed lookup while freeing chunk.");
2794                 ret = -ENOENT;
2795                 goto out;
2796         }
2797
2798         ret = btrfs_del_item(trans, root, path);
2799         if (ret < 0)
2800                 btrfs_handle_fs_error(fs_info, ret,
2801                                       "Failed to delete chunk item.");
2802 out:
2803         btrfs_free_path(path);
2804         return ret;
2805 }
2806
2807 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2808 {
2809         struct btrfs_super_block *super_copy = fs_info->super_copy;
2810         struct btrfs_disk_key *disk_key;
2811         struct btrfs_chunk *chunk;
2812         u8 *ptr;
2813         int ret = 0;
2814         u32 num_stripes;
2815         u32 array_size;
2816         u32 len = 0;
2817         u32 cur;
2818         struct btrfs_key key;
2819
2820         mutex_lock(&fs_info->chunk_mutex);
2821         array_size = btrfs_super_sys_array_size(super_copy);
2822
2823         ptr = super_copy->sys_chunk_array;
2824         cur = 0;
2825
2826         while (cur < array_size) {
2827                 disk_key = (struct btrfs_disk_key *)ptr;
2828                 btrfs_disk_key_to_cpu(&key, disk_key);
2829
2830                 len = sizeof(*disk_key);
2831
2832                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2833                         chunk = (struct btrfs_chunk *)(ptr + len);
2834                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2835                         len += btrfs_chunk_item_size(num_stripes);
2836                 } else {
2837                         ret = -EIO;
2838                         break;
2839                 }
2840                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2841                     key.offset == chunk_offset) {
2842                         memmove(ptr, ptr + len, array_size - (cur + len));
2843                         array_size -= len;
2844                         btrfs_set_super_sys_array_size(super_copy, array_size);
2845                 } else {
2846                         ptr += len;
2847                         cur += len;
2848                 }
2849         }
2850         mutex_unlock(&fs_info->chunk_mutex);
2851         return ret;
2852 }
2853
2854 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2855                                         u64 logical, u64 length)
2856 {
2857         struct extent_map_tree *em_tree;
2858         struct extent_map *em;
2859
2860         em_tree = &fs_info->mapping_tree.map_tree;
2861         read_lock(&em_tree->lock);
2862         em = lookup_extent_mapping(em_tree, logical, length);
2863         read_unlock(&em_tree->lock);
2864
2865         if (!em) {
2866                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2867                            logical, length);
2868                 return ERR_PTR(-EINVAL);
2869         }
2870
2871         if (em->start > logical || em->start + em->len < logical) {
2872                 btrfs_crit(fs_info,
2873                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2874                            logical, length, em->start, em->start + em->len);
2875                 free_extent_map(em);
2876                 return ERR_PTR(-EINVAL);
2877         }
2878
2879         /* callers are responsible for dropping em's ref. */
2880         return em;
2881 }
2882
2883 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2884                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2885 {
2886         struct extent_map *em;
2887         struct map_lookup *map;
2888         u64 dev_extent_len = 0;
2889         int i, ret = 0;
2890         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2891
2892         em = get_chunk_map(fs_info, chunk_offset, 1);
2893         if (IS_ERR(em)) {
2894                 /*
2895                  * This is a logic error, but we don't want to just rely on the
2896                  * user having built with ASSERT enabled, so if ASSERT doesn't
2897                  * do anything we still error out.
2898                  */
2899                 ASSERT(0);
2900                 return PTR_ERR(em);
2901         }
2902         map = em->map_lookup;
2903         mutex_lock(&fs_info->chunk_mutex);
2904         check_system_chunk(trans, fs_info, map->type);
2905         mutex_unlock(&fs_info->chunk_mutex);
2906
2907         /*
2908          * Take the device list mutex to prevent races with the final phase of
2909          * a device replace operation that replaces the device object associated
2910          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2911          */
2912         mutex_lock(&fs_devices->device_list_mutex);
2913         for (i = 0; i < map->num_stripes; i++) {
2914                 struct btrfs_device *device = map->stripes[i].dev;
2915                 ret = btrfs_free_dev_extent(trans, device,
2916                                             map->stripes[i].physical,
2917                                             &dev_extent_len);
2918                 if (ret) {
2919                         mutex_unlock(&fs_devices->device_list_mutex);
2920                         btrfs_abort_transaction(trans, ret);
2921                         goto out;
2922                 }
2923
2924                 if (device->bytes_used > 0) {
2925                         mutex_lock(&fs_info->chunk_mutex);
2926                         btrfs_device_set_bytes_used(device,
2927                                         device->bytes_used - dev_extent_len);
2928                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2929                         btrfs_clear_space_info_full(fs_info);
2930                         mutex_unlock(&fs_info->chunk_mutex);
2931                 }
2932
2933                 if (map->stripes[i].dev) {
2934                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2935                         if (ret) {
2936                                 mutex_unlock(&fs_devices->device_list_mutex);
2937                                 btrfs_abort_transaction(trans, ret);
2938                                 goto out;
2939                         }
2940                 }
2941         }
2942         mutex_unlock(&fs_devices->device_list_mutex);
2943
2944         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2945         if (ret) {
2946                 btrfs_abort_transaction(trans, ret);
2947                 goto out;
2948         }
2949
2950         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2951
2952         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2953                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2954                 if (ret) {
2955                         btrfs_abort_transaction(trans, ret);
2956                         goto out;
2957                 }
2958         }
2959
2960         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2961         if (ret) {
2962                 btrfs_abort_transaction(trans, ret);
2963                 goto out;
2964         }
2965
2966 out:
2967         /* once for us */
2968         free_extent_map(em);
2969         return ret;
2970 }
2971
2972 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2973 {
2974         struct btrfs_root *root = fs_info->chunk_root;
2975         struct btrfs_trans_handle *trans;
2976         int ret;
2977
2978         /*
2979          * Prevent races with automatic removal of unused block groups.
2980          * After we relocate and before we remove the chunk with offset
2981          * chunk_offset, automatic removal of the block group can kick in,
2982          * resulting in a failure when calling btrfs_remove_chunk() below.
2983          *
2984          * Make sure to acquire this mutex before doing a tree search (dev
2985          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2986          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2987          * we release the path used to search the chunk/dev tree and before
2988          * the current task acquires this mutex and calls us.
2989          */
2990         ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
2991
2992         ret = btrfs_can_relocate(fs_info, chunk_offset);
2993         if (ret)
2994                 return -ENOSPC;
2995
2996         /* step one, relocate all the extents inside this chunk */
2997         btrfs_scrub_pause(fs_info);
2998         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2999         btrfs_scrub_continue(fs_info);
3000         if (ret)
3001                 return ret;
3002
3003         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3004                                                      chunk_offset);
3005         if (IS_ERR(trans)) {
3006                 ret = PTR_ERR(trans);
3007                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3008                 return ret;
3009         }
3010
3011         /*
3012          * step two, delete the device extents and the
3013          * chunk tree entries
3014          */
3015         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
3016         btrfs_end_transaction(trans);
3017         return ret;
3018 }
3019
3020 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3021 {
3022         struct btrfs_root *chunk_root = fs_info->chunk_root;
3023         struct btrfs_path *path;
3024         struct extent_buffer *leaf;
3025         struct btrfs_chunk *chunk;
3026         struct btrfs_key key;
3027         struct btrfs_key found_key;
3028         u64 chunk_type;
3029         bool retried = false;
3030         int failed = 0;
3031         int ret;
3032
3033         path = btrfs_alloc_path();
3034         if (!path)
3035                 return -ENOMEM;
3036
3037 again:
3038         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3039         key.offset = (u64)-1;
3040         key.type = BTRFS_CHUNK_ITEM_KEY;
3041
3042         while (1) {
3043                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3044                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3045                 if (ret < 0) {
3046                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3047                         goto error;
3048                 }
3049                 BUG_ON(ret == 0); /* Corruption */
3050
3051                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3052                                           key.type);
3053                 if (ret)
3054                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3055                 if (ret < 0)
3056                         goto error;
3057                 if (ret > 0)
3058                         break;
3059
3060                 leaf = path->nodes[0];
3061                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3062
3063                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3064                                        struct btrfs_chunk);
3065                 chunk_type = btrfs_chunk_type(leaf, chunk);
3066                 btrfs_release_path(path);
3067
3068                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3069                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3070                         if (ret == -ENOSPC)
3071                                 failed++;
3072                         else
3073                                 BUG_ON(ret);
3074                 }
3075                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3076
3077                 if (found_key.offset == 0)
3078                         break;
3079                 key.offset = found_key.offset - 1;
3080         }
3081         ret = 0;
3082         if (failed && !retried) {
3083                 failed = 0;
3084                 retried = true;
3085                 goto again;
3086         } else if (WARN_ON(failed && retried)) {
3087                 ret = -ENOSPC;
3088         }
3089 error:
3090         btrfs_free_path(path);
3091         return ret;
3092 }
3093
3094 /*
3095  * return 1 : allocate a data chunk successfully,
3096  * return <0: errors during allocating a data chunk,
3097  * return 0 : no need to allocate a data chunk.
3098  */
3099 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3100                                       u64 chunk_offset)
3101 {
3102         struct btrfs_block_group_cache *cache;
3103         u64 bytes_used;
3104         u64 chunk_type;
3105
3106         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3107         ASSERT(cache);
3108         chunk_type = cache->flags;
3109         btrfs_put_block_group(cache);
3110
3111         if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3112                 spin_lock(&fs_info->data_sinfo->lock);
3113                 bytes_used = fs_info->data_sinfo->bytes_used;
3114                 spin_unlock(&fs_info->data_sinfo->lock);
3115
3116                 if (!bytes_used) {
3117                         struct btrfs_trans_handle *trans;
3118                         int ret;
3119
3120                         trans = btrfs_join_transaction(fs_info->tree_root);
3121                         if (IS_ERR(trans))
3122                                 return PTR_ERR(trans);
3123
3124                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3125                                                       BTRFS_BLOCK_GROUP_DATA);
3126                         btrfs_end_transaction(trans);
3127                         if (ret < 0)
3128                                 return ret;
3129
3130                         return 1;
3131                 }
3132         }
3133         return 0;
3134 }
3135
3136 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3137                                struct btrfs_balance_control *bctl)
3138 {
3139         struct btrfs_root *root = fs_info->tree_root;
3140         struct btrfs_trans_handle *trans;
3141         struct btrfs_balance_item *item;
3142         struct btrfs_disk_balance_args disk_bargs;
3143         struct btrfs_path *path;
3144         struct extent_buffer *leaf;
3145         struct btrfs_key key;
3146         int ret, err;
3147
3148         path = btrfs_alloc_path();
3149         if (!path)
3150                 return -ENOMEM;
3151
3152         trans = btrfs_start_transaction(root, 0);
3153         if (IS_ERR(trans)) {
3154                 btrfs_free_path(path);
3155                 return PTR_ERR(trans);
3156         }
3157
3158         key.objectid = BTRFS_BALANCE_OBJECTID;
3159         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3160         key.offset = 0;
3161
3162         ret = btrfs_insert_empty_item(trans, root, path, &key,
3163                                       sizeof(*item));
3164         if (ret)
3165                 goto out;
3166
3167         leaf = path->nodes[0];
3168         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3169
3170         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3171
3172         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3173         btrfs_set_balance_data(leaf, item, &disk_bargs);
3174         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3175         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3176         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3177         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3178
3179         btrfs_set_balance_flags(leaf, item, bctl->flags);
3180
3181         btrfs_mark_buffer_dirty(leaf);
3182 out:
3183         btrfs_free_path(path);
3184         err = btrfs_commit_transaction(trans);
3185         if (err && !ret)
3186                 ret = err;
3187         return ret;
3188 }
3189
3190 static int del_balance_item(struct btrfs_fs_info *fs_info)
3191 {
3192         struct btrfs_root *root = fs_info->tree_root;
3193         struct btrfs_trans_handle *trans;
3194         struct btrfs_path *path;
3195         struct btrfs_key key;
3196         int ret, err;
3197
3198         path = btrfs_alloc_path();
3199         if (!path)
3200                 return -ENOMEM;
3201
3202         trans = btrfs_start_transaction(root, 0);
3203         if (IS_ERR(trans)) {
3204                 btrfs_free_path(path);
3205                 return PTR_ERR(trans);
3206         }
3207
3208         key.objectid = BTRFS_BALANCE_OBJECTID;
3209         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3210         key.offset = 0;
3211
3212         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3213         if (ret < 0)
3214                 goto out;
3215         if (ret > 0) {
3216                 ret = -ENOENT;
3217                 goto out;
3218         }
3219
3220         ret = btrfs_del_item(trans, root, path);
3221 out:
3222         btrfs_free_path(path);
3223         err = btrfs_commit_transaction(trans);
3224         if (err && !ret)
3225                 ret = err;
3226         return ret;
3227 }
3228
3229 /*
3230  * This is a heuristic used to reduce the number of chunks balanced on
3231  * resume after balance was interrupted.
3232  */
3233 static void update_balance_args(struct btrfs_balance_control *bctl)
3234 {
3235         /*
3236          * Turn on soft mode for chunk types that were being converted.
3237          */
3238         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3239                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3240         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3241                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3242         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3243                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;