btrfs: move btrfs_raid_group values to btrfs_raid_attr table
fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

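/*
 * Attributes of each raid profile, indexed by BTRFS_RAID_*; see struct
 * btrfs_raid_attr in volumes.h. devs_max == 0 means no upper limit on the
 * device count, devs_min is the minimum number of devices needed to create
 * the profile, tolerated_failures is how many devices may fail without data
 * loss, devs_increment is the granularity of the device count, and ncopies
 * is the number of copies of the data.
 */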
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.raid_name	= "single",
		.bg_flag	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
	},
};

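/*
 * Return the human readable name of the given raid type, or NULL if the type
 * is out of range.
 */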
const char *get_raid_name(enum btrfs_raid_types type)
{
	if (type >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[type].raid_name;
}

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

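/*
 * Release a btrfs_device: free the RCU protected name, drop the preallocated
 * flush bio and free the structure itself.
 */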
void btrfs_free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

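/*
 * Send a uevent for the disk backing @bdev and warn if the event could not
 * be delivered.
 */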
static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

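/*
 * Module exit helper: release every fs_devices structure still linked on the
 * global fs_uuids list.
 */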
void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

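/*
 * Put a chain of bios (from @head to @tail) back at the front of the pending
 * list, preserving their order; used when we back off because the device is
 * congested.
 */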
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, set up a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

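/* Worker callback: submit all bios queued on the device */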
static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both arguments are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release only the unmounted
 *		devices matching this path.
 *  skip_dev:	Optional. Will skip this device when searching for stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->fs_list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				btrfs_free_device(dev);
			}
		}
	}
}

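/*
 * Open the block device of @device, verify that the superblock found there
 * matches the expected devid and uuid, and account the device in the
 * fs_devices counters. Returns 0 on success or a negative errno on failure.
 */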
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted:
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with a
		 *         different name, or
		 *      b. The missing disk which was replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with the larger generation number or the last-in if
			 * the generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmounting does not free the btrfs_device struct, but it zeroes the
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

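/*
 * Create an in-memory duplicate of @orig: a new fs_devices structure with
 * freshly allocated copies of all devices on the original list. The copy is
 * not opened.
 */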
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove the devices which do not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

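/* RCU callback to free a device once all RCU readers are done with it */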
static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

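/*
 * Replace @device on the fs_devices list with a freshly allocated copy that
 * keeps only the devid, uuid and name, so that late RCU readers still see a
 * valid structure after the original device is closed and freed.
 */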
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and a call to
	 * blkdev_put() can sometimes lead the VFS back into this function.
	 * So, for now, do the put outside of device_list_mutex.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

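/*
 * Open as many devices of @fs_devices as possible and remember the device
 * with the highest generation as latest_bdev. Fails with -EINVAL only when
 * no device could be opened at all.
 */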
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

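/* list_sort() comparator: order the device list by ascending devid */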
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

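/*
 * Read the page containing the superblock at @bytenr through the pagecache
 * and sanity check its bytenr and magic. Returns 0 and the mapped superblock
 * on success, 1 on any failure.
 */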
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

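/*
 * Check whether any pending (or pinned) chunk overlaps the device range that
 * starts at *start and is @len bytes long. If so, advance *start past the
 * overlapping stripes and return 1, otherwise return 0.
 */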
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device in which we search for the free space
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
1432 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1433                                struct btrfs_device *device, u64 num_bytes,
1434                                u64 search_start, u64 *start, u64 *len)
1435 {
1436         struct btrfs_fs_info *fs_info = device->fs_info;
1437         struct btrfs_root *root = fs_info->dev_root;
1438         struct btrfs_key key;
1439         struct btrfs_dev_extent *dev_extent;
1440         struct btrfs_path *path;
1441         u64 hole_size;
1442         u64 max_hole_start;
1443         u64 max_hole_size;
1444         u64 extent_end;
1445         u64 search_end = device->total_bytes;
1446         int ret;
1447         int slot;
1448         struct extent_buffer *l;
1449
1450         /*
1451          * We don't want to overwrite the superblock on the drive nor any area
1452          * used by the boot loader (grub for example), so we make sure to start
1453          * at an offset of at least 1MB.
1454          */
1455         search_start = max_t(u64, search_start, SZ_1M);
1456
1457         path = btrfs_alloc_path();
1458         if (!path)
1459                 return -ENOMEM;
1460
1461         max_hole_start = search_start;
1462         max_hole_size = 0;
1463
1464 again:
1465         if (search_start >= search_end ||
1466             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1467                 ret = -ENOSPC;
1468                 goto out;
1469         }
1470
1471         path->reada = READA_FORWARD;
1472         path->search_commit_root = 1;
1473         path->skip_locking = 1;
1474
1475         key.objectid = device->devid;
1476         key.offset = search_start;
1477         key.type = BTRFS_DEV_EXTENT_KEY;
1478
1479         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1480         if (ret < 0)
1481                 goto out;
1482         if (ret > 0) {
1483                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1484                 if (ret < 0)
1485                         goto out;
1486         }
1487
1488         while (1) {
1489                 l = path->nodes[0];
1490                 slot = path->slots[0];
1491                 if (slot >= btrfs_header_nritems(l)) {
1492                         ret = btrfs_next_leaf(root, path);
1493                         if (ret == 0)
1494                                 continue;
1495                         if (ret < 0)
1496                                 goto out;
1497
1498                         break;
1499                 }
1500                 btrfs_item_key_to_cpu(l, &key, slot);
1501
1502                 if (key.objectid < device->devid)
1503                         goto next;
1504
1505                 if (key.objectid > device->devid)
1506                         break;
1507
1508                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1509                         goto next;
1510
1511                 if (key.offset > search_start) {
1512                         hole_size = key.offset - search_start;
1513
1514                         /*
1515                          * Have to check before we set max_hole_start, otherwise
1516                          * we could end up sending back this offset anyway.
1517                          */
1518                         if (contains_pending_extent(transaction, device,
1519                                                     &search_start,
1520                                                     hole_size)) {
1521                                 if (key.offset >= search_start) {
1522                                         hole_size = key.offset - search_start;
1523                                 } else {
1524                                         WARN_ON_ONCE(1);
1525                                         hole_size = 0;
1526                                 }
1527                         }
1528
1529                         if (hole_size > max_hole_size) {
1530                                 max_hole_start = search_start;
1531                                 max_hole_size = hole_size;
1532                         }
1533
1534                         /*
1535                          * If this free space is greater than what we need,
1536                          * it must be the max free space that we have found
1537                          * until now, so max_hole_start must point to the start
1538                          * of this free space and the length of this free space
1539                          * is stored in max_hole_size. Thus, we return
1540                          * max_hole_start and max_hole_size and go back to the
1541                          * caller.
1542                          */
1543                         if (hole_size >= num_bytes) {
1544                                 ret = 0;
1545                                 goto out;
1546                         }
1547                 }
1548
1549                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1550                 extent_end = key.offset + btrfs_dev_extent_length(l,
1551                                                                   dev_extent);
1552                 if (extent_end > search_start)
1553                         search_start = extent_end;
1554 next:
1555                 path->slots[0]++;
1556                 cond_resched();
1557         }
1558
1559         /*
1560          * At this point, search_start should be the end of
1561          * allocated dev extents, and when shrinking the device,
1562          * search_end may be smaller than search_start.
1563          */
1564         if (search_end > search_start) {
1565                 hole_size = search_end - search_start;
1566
1567                 if (contains_pending_extent(transaction, device, &search_start,
1568                                             hole_size)) {
1569                         btrfs_release_path(path);
1570                         goto again;
1571                 }
1572
1573                 if (hole_size > max_hole_size) {
1574                         max_hole_start = search_start;
1575                         max_hole_size = hole_size;
1576                 }
1577         }
1578
1579         /* See above. */
1580         if (max_hole_size < num_bytes)
1581                 ret = -ENOSPC;
1582         else
1583                 ret = 0;
1584
1585 out:
1586         btrfs_free_path(path);
1587         *start = max_hole_start;
1588         if (len)
1589                 *len = max_hole_size;
1590         return ret;
1591 }
1592
1593 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1594                          struct btrfs_device *device, u64 num_bytes,
1595                          u64 *start, u64 *len)
1596 {
1597         /* FIXME use last free of some kind */
1598         return find_free_dev_extent_start(trans->transaction, device,
1599                                           num_bytes, 0, start, len);
1600 }
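
/*
 * A minimal sketch of a hypothetical caller of find_free_dev_extent(),
 * only to illustrate the contract documented above; the names trans,
 * device and chunk_size are assumptions of this example, not code from
 * this file:
 *
 *	u64 dev_offset;
 *	u64 hole_len;
 *	int ret;
 *
 *	ret = find_free_dev_extent(trans, device, chunk_size,
 *				   &dev_offset, &hole_len);
 *	if (ret == 0) {
 *		// a hole of at least chunk_size starts at dev_offset
 *	} else if (ret == -ENOSPC) {
 *		// no hole was big enough; hole_len is the largest found
 *	}
 */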
1601
1602 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1603                           struct btrfs_device *device,
1604                           u64 start, u64 *dev_extent_len)
1605 {
1606         struct btrfs_fs_info *fs_info = device->fs_info;
1607         struct btrfs_root *root = fs_info->dev_root;
1608         int ret;
1609         struct btrfs_path *path;
1610         struct btrfs_key key;
1611         struct btrfs_key found_key;
1612         struct extent_buffer *leaf = NULL;
1613         struct btrfs_dev_extent *extent = NULL;
1614
1615         path = btrfs_alloc_path();
1616         if (!path)
1617                 return -ENOMEM;
1618
1619         key.objectid = device->devid;
1620         key.offset = start;
1621         key.type = BTRFS_DEV_EXTENT_KEY;
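        /*
         * Look up the dev extent item covering @start. If the exact key is
         * not found, step back to the previous dev extent item, verify that
         * it covers @start (see the BUG_ON below) and restart the search
         * from that key.
         */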
1622 again:
1623         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1624         if (ret > 0) {
1625                 ret = btrfs_previous_item(root, path, key.objectid,
1626                                           BTRFS_DEV_EXTENT_KEY);
1627                 if (ret)
1628                         goto out;
1629                 leaf = path->nodes[0];
1630                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1631                 extent = btrfs_item_ptr(leaf, path->slots[0],
1632                                         struct btrfs_dev_extent);
1633                 BUG_ON(found_key.offset > start || found_key.offset +
1634                        btrfs_dev_extent_length(leaf, extent) < start);
1635                 key = found_key;
1636                 btrfs_release_path(path);
1637                 goto again;
1638         } else if (ret == 0) {
1639                 leaf = path->nodes[0];
1640                 extent = btrfs_item_ptr(leaf, path->slots[0],
1641                                         struct btrfs_dev_extent);
1642         } else {
1643                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1644                 goto out;
1645         }
1646
1647         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1648
1649         ret = btrfs_del_item(trans, root, path);
1650         if (ret) {
1651                 btrfs_handle_fs_error(fs_info, ret,
1652                                       "Failed to remove dev extent item");
1653         } else {
1654                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1655         }
1656 out:
1657         btrfs_free_path(path);
1658         return ret;
1659 }
1660
1661 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1662                                   struct btrfs_device *device,
1663                                   u64 chunk_offset, u64 start, u64 num_bytes)
1664 {
1665         int ret;
1666         struct btrfs_path *path;
1667         struct btrfs_fs_info *fs_info = device->fs_info;
1668         struct btrfs_root *root = fs_info->dev_root;
1669         struct btrfs_dev_extent *extent;
1670         struct extent_buffer *leaf;
1671         struct btrfs_key key;
1672
1673         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1674         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1675         path = btrfs_alloc_path();
1676         if (!path)
1677                 return -ENOMEM;
1678
1679         key.objectid = device->devid;
1680         key.offset = start;
1681         key.type = BTRFS_DEV_EXTENT_KEY;
1682         ret = btrfs_insert_empty_item(trans, root, path, &key,
1683                                       sizeof(*extent));
1684         if (ret)
1685                 goto out;
1686
1687         leaf = path->nodes[0];
1688         extent = btrfs_item_ptr(leaf, path->slots[0],
1689                                 struct btrfs_dev_extent);
1690         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1691                                         BTRFS_CHUNK_TREE_OBJECTID);
1692         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1693                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1694         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1695
1696         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1697         btrfs_mark_buffer_dirty(leaf);
1698 out:
1699         btrfs_free_path(path);
1700         return ret;
1701 }
1702
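/*
 * Find the logical offset at which the next chunk can be placed: the end
 * of the last (highest) extent map in the mapping tree, or 0 if the tree
 * is empty.
 */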
1703 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1704 {
1705         struct extent_map_tree *em_tree;
1706         struct extent_map *em;
1707         struct rb_node *n;
1708         u64 ret = 0;
1709
1710         em_tree = &fs_info->mapping_tree.map_tree;
1711         read_lock(&em_tree->lock);
1712         n = rb_last(&em_tree->map);
1713         if (n) {
1714                 em = rb_entry(n, struct extent_map, rb_node);
1715                 ret = em->start + em->len;
1716         }
1717         read_unlock(&em_tree->lock);
1718
1719         return ret;
1720 }
1721
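/*
 * Find the next available devid: look up the last dev item in the chunk
 * tree and return its devid + 1, or 1 if there is no dev item yet.
 */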
1722 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1723                                     u64 *devid_ret)
1724 {
1725         int ret;
1726         struct btrfs_key key;
1727         struct btrfs_key found_key;
1728         struct btrfs_path *path;
1729
1730         path = btrfs_alloc_path();
1731         if (!path)
1732                 return -ENOMEM;
1733
1734         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1735         key.type = BTRFS_DEV_ITEM_KEY;
1736         key.offset = (u64)-1;
1737
1738         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1739         if (ret < 0)
1740                 goto error;
1741
1742         BUG_ON(ret == 0); /* Corruption */
1743
1744         ret = btrfs_previous_item(fs_info->chunk_root, path,
1745                                   BTRFS_DEV_ITEMS_OBJECTID,
1746                                   BTRFS_DEV_ITEM_KEY);
1747         if (ret) {
1748                 *devid_ret = 1;
1749         } else {
1750                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1751                                       path->slots[0]);
1752                 *devid_ret = found_key.offset + 1;
1753         }
1754         ret = 0;
1755 error:
1756         btrfs_free_path(path);
1757         return ret;
1758 }
1759
1760 /*
1761  * The device information is stored in the chunk root.
1762  * The btrfs_device struct should be fully filled in.
1763  */
1764 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1765                             struct btrfs_fs_info *fs_info,
1766                             struct btrfs_device *device)
1767 {
1768         struct btrfs_root *root = fs_info->chunk_root;
1769         int ret;
1770         struct btrfs_path *path;
1771         struct btrfs_dev_item *dev_item;
1772         struct extent_buffer *leaf;
1773         struct btrfs_key key;
1774         unsigned long ptr;
1775
1776         path = btrfs_alloc_path();
1777         if (!path)
1778                 return -ENOMEM;
1779
1780         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1781         key.type = BTRFS_DEV_ITEM_KEY;
1782         key.offset = device->devid;
1783
1784         ret = btrfs_insert_empty_item(trans, root, path, &key,
1785                                       sizeof(*dev_item));
1786         if (ret)
1787                 goto out;
1788
1789         leaf = path->nodes[0];
1790         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1791
1792         btrfs_set_device_id(leaf, dev_item, device->devid);
1793         btrfs_set_device_generation(leaf, dev_item, 0);
1794         btrfs_set_device_type(leaf, dev_item, device->type);
1795         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1796         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1797         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1798         btrfs_set_device_total_bytes(leaf, dev_item,
1799                                      btrfs_device_get_disk_total_bytes(device));
1800         btrfs_set_device_bytes_used(leaf, dev_item,
1801                                     btrfs_device_get_bytes_used(device));
1802         btrfs_set_device_group(leaf, dev_item, 0);
1803         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1804         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1805         btrfs_set_device_start_offset(leaf, dev_item, 0);
1806
1807         ptr = btrfs_device_uuid(dev_item);
1808         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1809         ptr = btrfs_device_fsid(dev_item);
1810         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1811         btrfs_mark_buffer_dirty(leaf);
1812
1813         ret = 0;
1814 out:
1815         btrfs_free_path(path);
1816         return ret;
1817 }
1818
1819 /*
1820  * Function to update ctime/mtime for a given device path.
1821  * Mainly used for ctime/mtime based probes like libblkid.
1822  */
1823 static void update_dev_time(const char *path_name)
1824 {
1825         struct file *filp;
1826
1827         filp = filp_open(path_name, O_RDWR, 0);
1828         if (IS_ERR(filp))
1829                 return;
1830         file_update_time(filp);
1831         filp_close(filp, NULL);
1832 }
1833
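/*
 * Delete the dev item for @device from the chunk tree, in a transaction of
 * its own. Returns -ENOENT if the item cannot be found.
 */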
1834 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1835                              struct btrfs_device *device)
1836 {
1837         struct btrfs_root *root = fs_info->chunk_root;
1838         int ret;
1839         struct btrfs_path *path;
1840         struct btrfs_key key;
1841         struct btrfs_trans_handle *trans;
1842
1843         path = btrfs_alloc_path();
1844         if (!path)
1845                 return -ENOMEM;
1846
1847         trans = btrfs_start_transaction(root, 0);
1848         if (IS_ERR(trans)) {
1849                 btrfs_free_path(path);
1850                 return PTR_ERR(trans);
1851         }
1852         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1853         key.type = BTRFS_DEV_ITEM_KEY;
1854         key.offset = device->devid;
1855
1856         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1857         if (ret) {
1858                 if (ret > 0)
1859                         ret = -ENOENT;
1860                 btrfs_abort_transaction(trans, ret);
1861                 btrfs_end_transaction(trans);
1862                 goto out;
1863         }
1864
1865         ret = btrfs_del_item(trans, root, path);
1866         if (ret) {
1867                 btrfs_abort_transaction(trans, ret);
1868                 btrfs_end_transaction(trans);
1869         }
1870
1871 out:
1872         btrfs_free_path(path);
1873         if (!ret)
1874                 ret = btrfs_commit_transaction(trans);
1875         return ret;
1876 }
1877
1878 /*
1879  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1880  * filesystem. It's up to the caller to adjust that number with regard to,
1881  * e.g., a running device replace.
1882  */
1883 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1884                 u64 num_devices)
1885 {
1886         u64 all_avail;
1887         unsigned seq;
1888         int i;
1889
1890         do {
1891                 seq = read_seqbegin(&fs_info->profiles_lock);
1892
1893                 all_avail = fs_info->avail_data_alloc_bits |
1894                             fs_info->avail_system_alloc_bits |
1895                             fs_info->avail_metadata_alloc_bits;
1896         } while (read_seqretry(&fs_info->profiles_lock, seq));
1897
1898         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1899                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1900                         continue;
1901
1902                 if (num_devices < btrfs_raid_array[i].devs_min) {
1903                         int ret = btrfs_raid_mindev_error[i];
1904
1905                         if (ret)
1906                                 return ret;
1907                 }
1908         }
1909
1910         return 0;
1911 }
1912
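/*
 * A worked example of the check above (illustrative, not code from this
 * file): when btrfs_rm_device() wants to drop one device, it calls this
 * function with num_devices - 1. If RAID1 block groups are in use and that
 * reduced count falls below btrfs_raid_array[BTRFS_RAID_RAID1].devs_min,
 * the matching entry of btrfs_raid_mindev_error is returned and the
 * removal is refused.
 */
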
1913 static struct btrfs_device *btrfs_find_next_active_device(
1914                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1915 {
1916         struct btrfs_device *next_device;
1917
1918         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1919                 if (next_device != device &&
1920                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1921                     && next_device->bdev)
1922                         return next_device;
1923         }
1924
1925         return NULL;
1926 }
1927
1928 /*
1929  * Helper function to check if the given device is part of s_bdev / latest_bdev
1930  * and replace it with the provided or the next active device. In the context
1931  * where this function is called, there should always be another active device
1932  * (or this_dev) available.
1933  */
1934 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1935                 struct btrfs_device *device, struct btrfs_device *this_dev)
1936 {
1937         struct btrfs_device *next_device;
1938
1939         if (this_dev)
1940                 next_device = this_dev;
1941         else
1942                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1943                                                                 device);
1944         ASSERT(next_device);
1945
1946         if (fs_info->sb->s_bdev &&
1947                         (fs_info->sb->s_bdev == device->bdev))
1948                 fs_info->sb->s_bdev = next_device->bdev;
1949
1950         if (fs_info->fs_devices->latest_bdev == device->bdev)
1951                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1952 }
1953
1954 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1955                 u64 devid)
1956 {
1957         struct btrfs_device *device;
1958         struct btrfs_fs_devices *cur_devices;
1959         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1960         u64 num_devices;
1961         int ret = 0;
1962
1963         mutex_lock(&uuid_mutex);
1964
1965         num_devices = fs_devices->num_devices;
1966         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1967         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1968                 WARN_ON(num_devices < 1);
1969                 num_devices--;
1970         }
1971         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1972
1973         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1974         if (ret)
1975                 goto out;
1976
1977         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1978                                            &device);
1979         if (ret)
1980                 goto out;
1981
1982         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1983                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1984                 goto out;
1985         }
1986
1987         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1988             fs_info->fs_devices->rw_devices == 1) {
1989                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1990                 goto out;
1991         }
1992
1993         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1994                 mutex_lock(&fs_info->chunk_mutex);
1995                 list_del_init(&device->dev_alloc_list);
1996                 device->fs_devices->rw_devices--;
1997                 mutex_unlock(&fs_info->chunk_mutex);
1998         }
1999
2000         mutex_unlock(&uuid_mutex);
2001         ret = btrfs_shrink_device(device, 0);
2002         mutex_lock(&uuid_mutex);
2003         if (ret)
2004                 goto error_undo;
2005
2006         /*
2007          * TODO: the superblock still includes this device in its num_devices
2008          * counter although write_all_supers() is not locked out. This
2009          * could give a filesystem state which requires a degraded mount.
2010          */
2011         ret = btrfs_rm_dev_item(fs_info, device);
2012         if (ret)
2013                 goto error_undo;
2014
2015         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2016         btrfs_scrub_cancel_dev(fs_info, device);
2017
2018         /*
2019          * The device list mutex makes sure that we don't change
2020          * the device list while someone else is writing out all
2021          * the device supers. Whoever is writing all supers should
2022          * lock the device list mutex before getting the number of
2023          * devices in the super block (super_copy). Conversely,
2024          * whoever updates the number of devices in the super block
2025          * (super_copy) should hold the device list mutex.
2026          */
2027
2028         cur_devices = device->fs_devices;
2029         mutex_lock(&fs_devices->device_list_mutex);
2030         list_del_rcu(&device->dev_list);
2031
2032         device->fs_devices->num_devices--;
2033         device->fs_devices->total_devices--;
2034
2035         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2036                 device->fs_devices->missing_devices--;
2037
2038         btrfs_assign_next_active_device(fs_info, device, NULL);
2039
2040         if (device->bdev) {
2041                 device->fs_devices->open_devices--;
2042                 /* remove sysfs entry */
2043                 btrfs_sysfs_rm_device_link(fs_devices, device);
2044         }
2045
2046         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2047         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2048         mutex_unlock(&fs_devices->device_list_mutex);
2049
2050         /*
2051          * at this point, the device is zero sized and detached from
2052          * the devices list.  All that's left is to zero out the old
2053          * supers and free the device.
2054          */
2055         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2056                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2057
2058         btrfs_close_bdev(device);
2059         call_rcu(&device->rcu, free_device_rcu);
2060
2061         if (cur_devices->open_devices == 0) {
2062                 while (fs_devices) {
2063                         if (fs_devices->seed == cur_devices) {
2064                                 fs_devices->seed = cur_devices->seed;
2065                                 break;
2066                         }
2067                         fs_devices = fs_devices->seed;
2068                 }
2069                 cur_devices->seed = NULL;
2070                 close_fs_devices(cur_devices);
2071                 free_fs_devices(cur_devices);
2072         }
2073
2074 out:
2075         mutex_unlock(&uuid_mutex);
2076         return ret;
2077
2078 error_undo:
2079         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2080                 mutex_lock(&fs_info->chunk_mutex);
2081                 list_add(&device->dev_alloc_list,
2082                          &fs_devices->alloc_list);
2083                 device->fs_devices->rw_devices++;
2084                 mutex_unlock(&fs_info->chunk_mutex);
2085         }
2086         goto out;
2087 }
2088
2089 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2090                                         struct btrfs_device *srcdev)
2091 {
2092         struct btrfs_fs_devices *fs_devices;
2093
2094         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2095
2096         /*
2097          * In case of a filesystem with no seed, srcdev->fs_devices points to
2098          * the fs_devices of fs_info. However, when the device being replaced
2099          * is a seed device, it points to the seed's local fs_devices. In
2100          * short, srcdev has its correct fs_devices in both cases.
2101          */
2102         fs_devices = srcdev->fs_devices;
2103
2104         list_del_rcu(&srcdev->dev_list);
2105         list_del(&srcdev->dev_alloc_list);
2106         fs_devices->num_devices--;
2107         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2108                 fs_devices->missing_devices--;
2109
2110         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2111                 fs_devices->rw_devices--;
2112
2113         if (srcdev->bdev)
2114                 fs_devices->open_devices--;
2115 }
2116
2117 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2118                                       struct btrfs_device *srcdev)
2119 {
2120         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2121
2122         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2123                 /* zero out the old super if it is writable */
2124                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2125         }
2126
2127         btrfs_close_bdev(srcdev);
2128         call_rcu(&srcdev->rcu, free_device_rcu);
2129
2130         /* If there are no devices left, delete the fs_devices. */
2131         if (!fs_devices->num_devices) {
2132                 struct btrfs_fs_devices *tmp_fs_devices;
2133
2134                 /*
2135                  * On a mounted FS, num_devices can't be zero unless it's a
2136                  * seed. In case of a seed device being replaced, the replace
2137                  * target is added to the sprout FS, so there will be no
2138                  * device left under the seed FS.
2139                  */
2140                 ASSERT(fs_devices->seeding);
2141
2142                 tmp_fs_devices = fs_info->fs_devices;
2143                 while (tmp_fs_devices) {
2144                         if (tmp_fs_devices->seed == fs_devices) {
2145                                 tmp_fs_devices->seed = fs_devices->seed;
2146                                 break;
2147                         }
2148                         tmp_fs_devices = tmp_fs_devices->seed;
2149                 }
2150                 fs_devices->seed = NULL;
2151                 close_fs_devices(fs_devices);
2152                 free_fs_devices(fs_devices);
2153         }
2154 }
2155
2156 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2157                                       struct btrfs_device *tgtdev)
2158 {
2159         mutex_lock(&uuid_mutex);
2160         WARN_ON(!tgtdev);
2161         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2162
2163         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2164
2165         if (tgtdev->bdev)
2166                 fs_info->fs_devices->open_devices--;
2167
2168         fs_info->fs_devices->num_devices--;
2169
2170         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2171
2172         list_del_rcu(&tgtdev->dev_list);
2173
2174         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2175         mutex_unlock(&uuid_mutex);
2176
2177         /*
2178          * The update_dev_time() within btrfs_scratch_superblocks()
2179          * may lead to a call to btrfs_show_devname() which will try
2180          * to hold device_list_mutex. Since this device is already
2181          * out of the device list, we don't have to hold
2182          * the device_list_mutex lock.
2183          */
2184         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2185
2186         btrfs_close_bdev(tgtdev);
2187         call_rcu(&tgtdev->rcu, free_device_rcu);
2188 }
2189
2190 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2191                                      const char *device_path,
2192                                      struct btrfs_device **device)
2193 {
2194         int ret = 0;
2195         struct btrfs_super_block *disk_super;
2196         u64 devid;
2197         u8 *dev_uuid;
2198         struct block_device *bdev;
2199         struct buffer_head *bh;
2200
2201         *device = NULL;
2202         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2203                                     fs_info->bdev_holder, 0, &bdev, &bh);
2204         if (ret)
2205                 return ret;
2206         disk_super = (struct btrfs_super_block *)bh->b_data;
2207         devid = btrfs_stack_device_id(&disk_super->dev_item);
2208         dev_uuid = disk_super->dev_item.uuid;
2209         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2210         brelse(bh);
2211         if (!*device)
2212                 ret = -ENOENT;
2213         blkdev_put(bdev, FMODE_READ);
2214         return ret;
2215 }
2216
2217 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2218                                          const char *device_path,
2219                                          struct btrfs_device **device)
2220 {
2221         *device = NULL;
2222         if (strcmp(device_path, "missing") == 0) {
2223                 struct list_head *devices;
2224                 struct btrfs_device *tmp;
2225
2226                 devices = &fs_info->fs_devices->devices;
2227                 list_for_each_entry(tmp, devices, dev_list) {
2228                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2229                                         &tmp->dev_state) && !tmp->bdev) {
2230                                 *device = tmp;
2231                                 break;
2232                         }
2233                 }
2234
2235                 if (!*device)
2236                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2237
2238                 return 0;
2239         } else {
2240                 return btrfs_find_device_by_path(fs_info, device_path, device);
2241         }
2242 }
2243
2244 /*
2245  * Lookup a device given by device id, or the path if the id is 0.
2246  */
2247 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2248                                  const char *devpath,
2249                                  struct btrfs_device **device)
2250 {
2251         int ret;
2252
2253         if (devid) {
2254                 ret = 0;
2255                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2256                 if (!*device)
2257                         ret = -ENOENT;
2258         } else {
2259                 if (!devpath || !devpath[0])
2260                         return -EINVAL;
2261
2262                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2263                                                            device);
2264         }
2265         return ret;
2266 }
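
/*
 * A minimal sketch of how the lookup above is typically driven (the names
 * devid and path stand for user-supplied arguments and are assumptions of
 * this example):
 *
 *	struct btrfs_device *device;
 *	int ret;
 *
 *	// devid == 0 means "resolve by path"; the special path "missing"
 *	// picks the first device that is in the metadata but has no bdev
 *	ret = btrfs_find_device_by_devspec(fs_info, devid, path, &device);
 *	if (ret)
 *		return ret;
 */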
2267
2268 /*
2269  * Does all the dirty work required for changing the filesystem's UUID.
2270  */
2271 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2272 {
2273         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2274         struct btrfs_fs_devices *old_devices;
2275         struct btrfs_fs_devices *seed_devices;
2276         struct btrfs_super_block *disk_super = fs_info->super_copy;
2277         struct btrfs_device *device;
2278         u64 super_flags;
2279
2280         lockdep_assert_held(&uuid_mutex);
2281         if (!fs_devices->seeding)
2282                 return -EINVAL;
2283
2284         seed_devices = alloc_fs_devices(NULL);
2285         if (IS_ERR(seed_devices))
2286                 return PTR_ERR(seed_devices);
2287
2288         old_devices = clone_fs_devices(fs_devices);
2289         if (IS_ERR(old_devices)) {
2290                 kfree(seed_devices);
2291                 return PTR_ERR(old_devices);
2292         }
2293
2294         list_add(&old_devices->fs_list, &fs_uuids);
2295
2296         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2297         seed_devices->opened = 1;
2298         INIT_LIST_HEAD(&seed_devices->devices);
2299         INIT_LIST_HEAD(&seed_devices->alloc_list);
2300         mutex_init(&seed_devices->device_list_mutex);
2301
2302         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2303         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2304                               synchronize_rcu);
2305         list_for_each_entry(device, &seed_devices->devices, dev_list)
2306                 device->fs_devices = seed_devices;
2307
2308         mutex_lock(&fs_info->chunk_mutex);
2309         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2310         mutex_unlock(&fs_info->chunk_mutex);
2311
2312         fs_devices->seeding = 0;
2313         fs_devices->num_devices = 0;
2314         fs_devices->open_devices = 0;
2315         fs_devices->missing_devices = 0;
2316         fs_devices->rotating = 0;
2317         fs_devices->seed = seed_devices;
2318
2319         generate_random_uuid(fs_devices->fsid);
2320         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2321         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2322         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2323
2324         super_flags = btrfs_super_flags(disk_super) &
2325                       ~BTRFS_SUPER_FLAG_SEEDING;
2326         btrfs_set_super_flags(disk_super, super_flags);
2327
2328         return 0;
2329 }
2330
2331 /*
2332  * Store the expected generation for seed devices in device items.
2333  */
2334 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2335                                struct btrfs_fs_info *fs_info)
2336 {
2337         struct btrfs_root *root = fs_info->chunk_root;
2338         struct btrfs_path *path;
2339         struct extent_buffer *leaf;
2340         struct btrfs_dev_item *dev_item;
2341         struct btrfs_device *device;
2342         struct btrfs_key key;
2343         u8 fs_uuid[BTRFS_FSID_SIZE];
2344         u8 dev_uuid[BTRFS_UUID_SIZE];
2345         u64 devid;
2346         int ret;
2347
2348         path = btrfs_alloc_path();
2349         if (!path)
2350                 return -ENOMEM;
2351
2352         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2353         key.offset = 0;
2354         key.type = BTRFS_DEV_ITEM_KEY;
2355
2356         while (1) {
2357                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2358                 if (ret < 0)
2359                         goto error;
2360
2361                 leaf = path->nodes[0];
2362 next_slot:
2363                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2364                         ret = btrfs_next_leaf(root, path);
2365                         if (ret > 0)
2366                                 break;
2367                         if (ret < 0)
2368                                 goto error;
2369                         leaf = path->nodes[0];
2370                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2371                         btrfs_release_path(path);
2372                         continue;
2373                 }
2374
2375                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2376                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2377                     key.type != BTRFS_DEV_ITEM_KEY)
2378                         break;
2379
2380                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2381                                           struct btrfs_dev_item);
2382                 devid = btrfs_device_id(leaf, dev_item);
2383                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2384                                    BTRFS_UUID_SIZE);
2385                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2386                                    BTRFS_FSID_SIZE);
2387                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2388                 BUG_ON(!device); /* Logic error */
2389
2390                 if (device->fs_devices->seeding) {
2391                         btrfs_set_device_generation(leaf, dev_item,
2392                                                     device->generation);
2393                         btrfs_mark_buffer_dirty(leaf);
2394                 }
2395
2396                 path->slots[0]++;
2397                 goto next_slot;
2398         }
2399         ret = 0;
2400 error:
2401         btrfs_free_path(path);
2402         return ret;
2403 }
2404
2405 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2406 {
2407         struct btrfs_root *root = fs_info->dev_root;
2408         struct request_queue *q;
2409         struct btrfs_trans_handle *trans;
2410         struct btrfs_device *device;
2411         struct block_device *bdev;
2412         struct list_head *devices;
2413         struct super_block *sb = fs_info->sb;
2414         struct rcu_string *name;
2415         u64 tmp;
2416         int seeding_dev = 0;
2417         int ret = 0;
2418         bool unlocked = false;
2419
2420         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2421                 return -EROFS;
2422
2423         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2424                                   fs_info->bdev_holder);
2425         if (IS_ERR(bdev))
2426                 return PTR_ERR(bdev);
2427
2428         if (fs_info->fs_devices->seeding) {
2429                 seeding_dev = 1;
2430                 down_write(&sb->s_umount);
2431                 mutex_lock(&uuid_mutex);
2432         }
2433
2434         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2435
2436         devices = &fs_info->fs_devices->devices;
2437
2438         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2439         list_for_each_entry(device, devices, dev_list) {
2440                 if (device->bdev == bdev) {
2441                         ret = -EEXIST;
2442                         mutex_unlock(
2443                                 &fs_info->fs_devices->device_list_mutex);
2444                         goto error;
2445                 }
2446         }
2447         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448
2449         device = btrfs_alloc_device(fs_info, NULL, NULL);
2450         if (IS_ERR(device)) {
2451                 /* we can safely leave the fs_devices entry around */
2452                 ret = PTR_ERR(device);
2453                 goto error;
2454         }
2455
2456         name = rcu_string_strdup(device_path, GFP_KERNEL);
2457         if (!name) {
2458                 ret = -ENOMEM;
2459                 goto error_free_device;
2460         }
2461         rcu_assign_pointer(device->name, name);
2462
2463         trans = btrfs_start_transaction(root, 0);
2464         if (IS_ERR(trans)) {
2465                 ret = PTR_ERR(trans);
2466                 goto error_free_device;
2467         }
2468
2469         q = bdev_get_queue(bdev);
2470         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2471         device->generation = trans->transid;
2472         device->io_width = fs_info->sectorsize;
2473         device->io_align = fs_info->sectorsize;
2474         device->sector_size = fs_info->sectorsize;
2475         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2476                                          fs_info->sectorsize);
2477         device->disk_total_bytes = device->total_bytes;
2478         device->commit_total_bytes = device->total_bytes;
2479         device->fs_info = fs_info;
2480         device->bdev = bdev;
2481         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2482         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2483         device->mode = FMODE_EXCL;
2484         device->dev_stats_valid = 1;
2485         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2486
2487         if (seeding_dev) {
2488                 sb->s_flags &= ~SB_RDONLY;
2489                 ret = btrfs_prepare_sprout(fs_info);
2490                 if (ret) {
2491                         btrfs_abort_transaction(trans, ret);
2492                         goto error_trans;
2493                 }
2494         }
2495
2496         device->fs_devices = fs_info->fs_devices;
2497
2498         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2499         mutex_lock(&fs_info->chunk_mutex);
2500         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2501         list_add(&device->dev_alloc_list,
2502                  &fs_info->fs_devices->alloc_list);
2503         fs_info->fs_devices->num_devices++;
2504         fs_info->fs_devices->open_devices++;
2505         fs_info->fs_devices->rw_devices++;
2506         fs_info->fs_devices->total_devices++;
2507         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2508
2509         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2510
2511         if (!blk_queue_nonrot(q))
2512                 fs_info->fs_devices->rotating = 1;
2513
2514         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2515         btrfs_set_super_total_bytes(fs_info->super_copy,
2516                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2517
2518         tmp = btrfs_super_num_devices(fs_info->super_copy);
2519         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2520
2521         /* add sysfs device entry */
2522         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2523
2524         /*
2525          * we've got more storage, clear any full flags on the space
2526          * infos
2527          */
2528         btrfs_clear_space_info_full(fs_info);
2529
2530         mutex_unlock(&fs_info->chunk_mutex);
2531         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2532
2533         if (seeding_dev) {
2534                 mutex_lock(&fs_info->chunk_mutex);
2535                 ret = init_first_rw_device(trans, fs_info);
2536                 mutex_unlock(&fs_info->chunk_mutex);
2537                 if (ret) {
2538                         btrfs_abort_transaction(trans, ret);
2539                         goto error_sysfs;
2540                 }
2541         }
2542
2543         ret = btrfs_add_dev_item(trans, fs_info, device);
2544         if (ret) {
2545                 btrfs_abort_transaction(trans, ret);
2546                 goto error_sysfs;
2547         }
2548
2549         if (seeding_dev) {
2550                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2551
2552                 ret = btrfs_finish_sprout(trans, fs_info);
2553                 if (ret) {
2554                         btrfs_abort_transaction(trans, ret);
2555                         goto error_sysfs;
2556                 }
2557
2558                 /* Sprouting would change the fsid of the mounted root,
2559                  * so rename the fsid in sysfs.
2560                  */
2561                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2562                                                 fs_info->fsid);
2563                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2564                         btrfs_warn(fs_info,
2565                                    "sysfs: failed to create fsid for sprout");
2566         }
2567
2568         ret = btrfs_commit_transaction(trans);
2569
2570         if (seeding_dev) {
2571                 mutex_unlock(&uuid_mutex);
2572                 up_write(&sb->s_umount);
2573                 unlocked = true;
2574
2575                 if (ret) /* transaction commit */
2576                         return ret;
2577
2578                 ret = btrfs_relocate_sys_chunks(fs_info);
2579                 if (ret < 0)
2580                         btrfs_handle_fs_error(fs_info, ret,
2581                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2582                 trans = btrfs_attach_transaction(root);
2583                 if (IS_ERR(trans)) {
2584                         if (PTR_ERR(trans) == -ENOENT)
2585                                 return 0;
2586                         ret = PTR_ERR(trans);
2587                         trans = NULL;
2588                         goto error_sysfs;
2589                 }
2590                 ret = btrfs_commit_transaction(trans);
2591         }
2592
2593         /* Update ctime/mtime for libblkid */
2594         update_dev_time(device_path);
2595         return ret;
2596
2597 error_sysfs:
2598         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2599 error_trans:
2600         if (seeding_dev)
2601                 sb->s_flags |= SB_RDONLY;
2602         if (trans)
2603                 btrfs_end_transaction(trans);
2604 error_free_device:
2605         btrfs_free_device(device);
2606 error:
2607         blkdev_put(bdev, FMODE_EXCL);
2608         if (seeding_dev && !unlocked) {
2609                 mutex_unlock(&uuid_mutex);
2610                 up_write(&sb->s_umount);
2611         }
2612         return ret;
2613 }
2614
2615 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2616                                         struct btrfs_device *device)
2617 {
2618         int ret;
2619         struct btrfs_path *path;
2620         struct btrfs_root *root = device->fs_info->chunk_root;
2621         struct btrfs_dev_item *dev_item;
2622         struct extent_buffer *leaf;
2623         struct btrfs_key key;
2624
2625         path = btrfs_alloc_path();
2626         if (!path)
2627                 return -ENOMEM;
2628
2629         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2630         key.type = BTRFS_DEV_ITEM_KEY;
2631         key.offset = device->devid;
2632
2633         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2634         if (ret < 0)
2635                 goto out;
2636
2637         if (ret > 0) {
2638                 ret = -ENOENT;
2639                 goto out;
2640         }
2641
2642         leaf = path->nodes[0];
2643         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2644
2645         btrfs_set_device_id(leaf, dev_item, device->devid);
2646         btrfs_set_device_type(leaf, dev_item, device->type);
2647         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2648         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2649         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2650         btrfs_set_device_total_bytes(leaf, dev_item,
2651                                      btrfs_device_get_disk_total_bytes(device));
2652         btrfs_set_device_bytes_used(leaf, dev_item,
2653                                     btrfs_device_get_bytes_used(device));
2654         btrfs_mark_buffer_dirty(leaf);
2655
2656 out:
2657         btrfs_free_path(path);
2658         return ret;
2659 }
2660
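/*
 * Grow @device to @new_size, rounded down to a sectorsize multiple. This
 * updates the in-memory sizes, the superblock's total_bytes and the device
 * item in the chunk tree. Returns -EACCES for a non-writeable device and
 * -EINVAL if @new_size does not grow the device or the device is a replace
 * target.
 */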
2661 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2662                       struct btrfs_device *device, u64 new_size)
2663 {
2664         struct btrfs_fs_info *fs_info = device->fs_info;
2665         struct btrfs_super_block *super_copy = fs_info->super_copy;
2666         struct btrfs_fs_devices *fs_devices;
2667         u64 old_total;
2668         u64 diff;
2669
2670         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2671                 return -EACCES;
2672
2673         new_size = round_down(new_size, fs_info->sectorsize);
2674
2675         mutex_lock(&fs_info->chunk_mutex);
2676         old_total = btrfs_super_total_bytes(super_copy);
2677         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2678
2679         if (new_size <= device->total_bytes ||
2680             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2681                 mutex_unlock(&fs_info->chunk_mutex);
2682                 return -EINVAL;
2683         }
2684
2685         fs_devices = fs_info->fs_devices;
2686
2687         btrfs_set_super_total_bytes(super_copy,
2688                         round_down(old_total + diff, fs_info->sectorsize));
2689         device->fs_devices->total_rw_bytes += diff;
2690
2691         btrfs_device_set_total_bytes(device, new_size);
2692         btrfs_device_set_disk_total_bytes(device, new_size);
2693         btrfs_clear_space_info_full(device->fs_info);
2694         if (list_empty(&device->resized_list))
2695                 list_add_tail(&device->resized_list,
2696                               &fs_devices->resized_devices);
2697         mutex_unlock(&fs_info->chunk_mutex);
2698
2699         return btrfs_update_device(trans, device);
2700 }
2701
2702 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2703                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2704 {
2705         struct btrfs_root *root = fs_info->chunk_root;
2706         int ret;
2707         struct btrfs_path *path;
2708         struct btrfs_key key;
2709
2710         path = btrfs_alloc_path();
2711         if (!path)
2712                 return -ENOMEM;
2713
2714         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2715         key.offset = chunk_offset;
2716         key.type = BTRFS_CHUNK_ITEM_KEY;
2717
2718         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2719         if (ret < 0)
2720                 goto out;
2721         else if (ret > 0) { /* Logic error or corruption */
2722                 btrfs_handle_fs_error(fs_info, -ENOENT,
2723                                       "Failed lookup while freeing chunk.");
2724                 ret = -ENOENT;
2725                 goto out;
2726         }
2727
2728         ret = btrfs_del_item(trans, root, path);
2729         if (ret < 0)
2730                 btrfs_handle_fs_error(fs_info, ret,
2731                                       "Failed to delete chunk item.");
2732 out:
2733         btrfs_free_path(path);
2734         return ret;
2735 }
2736
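/*
 * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array.
 * The array is a packed sequence of (struct btrfs_disk_key, struct
 * btrfs_chunk including its stripes) pairs, so walk it entry by entry and
 * memmove() the tail over the entry that matches.
 */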
2737 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2738 {
2739         struct btrfs_super_block *super_copy = fs_info->super_copy;
2740         struct btrfs_disk_key *disk_key;
2741         struct btrfs_chunk *chunk;
2742         u8 *ptr;
2743         int ret = 0;
2744         u32 num_stripes;
2745         u32 array_size;
2746         u32 len = 0;
2747         u32 cur;
2748         struct btrfs_key key;
2749
2750         mutex_lock(&fs_info->chunk_mutex);
2751         array_size = btrfs_super_sys_array_size(super_copy);
2752
2753         ptr = super_copy->sys_chunk_array;
2754         cur = 0;
2755
2756         while (cur < array_size) {
2757                 disk_key = (struct btrfs_disk_key *)ptr;
2758                 btrfs_disk_key_to_cpu(&key, disk_key);
2759
2760                 len = sizeof(*disk_key);
2761
2762                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2763                         chunk = (struct btrfs_chunk *)(ptr + len);
2764                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2765                         len += btrfs_chunk_item_size(num_stripes);
2766                 } else {
2767                         ret = -EIO;
2768                         break;
2769                 }
2770                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2771                     key.offset == chunk_offset) {
2772                         memmove(ptr, ptr + len, array_size - (cur + len));
2773                         array_size -= len;
2774                         btrfs_set_super_sys_array_size(super_copy, array_size);
2775                 } else {
2776                         ptr += len;
2777                         cur += len;
2778                 }
2779         }
2780         mutex_unlock(&fs_info->chunk_mutex);
2781         return ret;
2782 }
2783
2784 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2785                                         u64 logical, u64 length)
2786 {
2787         struct extent_map_tree *em_tree;
2788         struct extent_map *em;
2789
2790         em_tree = &fs_info->mapping_tree.map_tree;
2791         read_lock(&em_tree->lock);
2792         em = lookup_extent_mapping(em_tree, logical, length);
2793         read_unlock(&em_tree->lock);
2794
2795         if (!em) {
2796                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2797                            logical, length);
2798                 return ERR_PTR(-EINVAL);
2799         }
2800
2801         if (em->start > logical || em->start + em->len < logical) {
2802                 btrfs_crit(fs_info,
2803                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2804                            logical, length, em->start, em->start + em->len);
2805                 free_extent_map(em);
2806                 return ERR_PTR(-EINVAL);
2807         }
2808
2809         /* callers are responsible for dropping em's ref. */
2810         return em;
2811 }
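
/*
 * A minimal usage sketch for get_chunk_map() (illustrative only): the
 * returned extent_map carries a reference that the caller must drop with
 * free_extent_map() when done, as btrfs_remove_chunk() below does:
 *
 *	struct extent_map *em;
 *	struct map_lookup *map;
 *
 *	em = get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	// ... use map ...
 *	free_extent_map(em);
 */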

int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us (drops the ref taken in get_chunk_map()) */
	free_extent_map(em);
	return ret;
}
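
/*
 * Illustrative sketch, not part of the original file: the raw device space
 * handed back by removing a chunk.  Each of the map's num_stripes stripes
 * releases one device extent of dev_extent_len bytes, so e.g. a RAID1
 * chunk of logical length L frees 2 * L bytes across its two devices.
 * Assumes all stripes of the chunk are equal-sized, which holds for the
 * profiles in btrfs_raid_array.  The helper name is hypothetical.
 */
static inline u64 example_raw_space_freed(int num_stripes, u64 dev_extent_len)
{
	return (u64)num_stripes * dev_extent_len;
}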

static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
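
/*
 * Illustrative sketch, not part of the original file: the locking contract
 * callers of btrfs_relocate_chunk() follow.  The delete_unused_bgs_mutex
 * must already be held when the chunk is looked up and must stay held
 * until relocation finishes, which is exactly how
 * btrfs_relocate_sys_chunks() below drives it.  The helper name is
 * hypothetical.
 */
static inline int example_relocate_one_chunk(struct btrfs_fs_info *fs_info,
					     u64 chunk_offset)
{
	int ret;

	mutex_lock(&fs_info->delete_unused_bgs_mutex);
	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
	return ret;
}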

static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
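
/*
 * Note on the walk above (added commentary, not in the original file):
 * searching for (BTRFS_FIRST_CHUNK_TREE_OBJECTID, CHUNK_ITEM, (u64)-1) and
 * stepping back with btrfs_previous_item() visits the chunk tree from the
 * highest offset downwards.  Setting key.offset = found_key.offset - 1
 * before re-searching keeps the scan safe against items being inserted or
 * removed while the path is dropped between iterations.
 */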

/*
 * Return 1 : a data chunk was allocated successfully,
 * return <0: an error occurred while allocating a data chunk,
 * return 0 : there was no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans = btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans, fs_info,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}
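
/*
 * Illustrative sketch, not part of the original file: how a caller such as
 * device shrink is expected to consume the tristate result before
 * relocating what may be the filesystem's last data chunk:
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 *	if (ret < 0)
 *		goto done;	// allocation failed
 *	// ret == 1: a fresh data chunk now exists to relocate into
 *	// ret == 0: nothing needed, proceed with relocation directly
 */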

static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
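
/*
 * Added commentary, not in the original file: the balance item lives in
 * the tree root at the fixed key (BTRFS_BALANCE_OBJECTID,
 * BTRFS_TEMPORARY_ITEM_KEY, 0), so at most one balance can be recorded at
 * a time.  The three btrfs_cpu_balance_args_to_disk() calls above can
 * safely reuse a single on-stack btrfs_disk_balance_args because each
 * filter set is copied into the leaf before the buffer is overwritten
 * with the next one.
 */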

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
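
/*
 * Added commentary, not in the original file: insert_balance_item() and
 * del_balance_item() bracket a balance operation's lifetime on disk.  The
 * item is written when a balance starts, survives crashes so the balance
 * can be resumed on the next mount, and is deleted once the balance
 * completes or is cancelled.
 */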

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on the usage filter if it is not already in use.  The idea
	 * is that chunks we have already balanced should be reasonably
	 * full.  Don't do it for chunks that are being converted - that
	 * will keep us from relocating unconverted (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}
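
/*
 * Illustrative example, not part of the original file: resuming an
 * interrupted "-dconvert=raid1" balance.  data.flags already has
 * BTRFS_BALANCE_ARGS_CONVERT, so only BTRFS_BALANCE_ARGS_SOFT is added
 * and chunks already in the target profile are skipped.  Resuming a plain
 * "-d" balance instead gains BTRFS_BALANCE_ARGS_USAGE with usage = 90,
 * skipping chunks that are at least 90% full on the assumption that they
 * were already balanced before the interruption.
 */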

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;