btrfs: move btrfs_raid_type_names values to btrfs_raid_attr table
fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

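/*
 * Attributes of each RAID profile, indexed by enum btrfs_raid_types. The
 * profile names used to live in a separate btrfs_raid_type_names table and
 * are now kept here in ->raid_name (see get_raid_name() below).
 */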
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid10",
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .raid_name      = "raid1",
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "dup",
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "raid0",
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .raid_name      = "single",
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
                .raid_name      = "raid5",
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
                .raid_name      = "raid6",
        },
};

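/* Return the human-readable name of a RAID profile, or NULL if out of range */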
const char *get_raid_name(enum btrfs_raid_types type)
{
        if (type >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[type].raid_name;
}

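/*
 * Map BTRFS_RAID_* to the corresponding BTRFS_BLOCK_GROUP_* profile bit.
 * SINGLE has no dedicated bit and maps to 0.
 */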
const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
        [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
        [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        [BTRFS_RAID_DUP]    = 0,
        [BTRFS_RAID_RAID0]  = 0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations, and
 * it remains set for the whole duration of the Paused state. A device
 * operation in Paused or Running state can be canceled or resumed either by
 * ioctl (Balance only) or when remounted as read-write. The flag is cleared
 * when the device operation is canceled or completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

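/*
 * btrfs_get_bdev_and_sb - open a block device and read its super block
 * @device_path:  path to the device to open
 * @flags:        open mode passed to blkdev_get_by_path
 * @holder:       exclusive holder cookie
 * @flush:        if set, write back and wait on dirty pagecache first
 * @bdev:         output, the opened block device
 * @bh:           output, buffer head holding the super block
 *
 * On failure both outputs are set to NULL and a negative errno is returned.
 */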
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

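/*
 * Put the bio chain from @head to @tail back on the front of the pending
 * list, ahead of any bios that were queued in the meantime.
 */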
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * We try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

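/* Work item callback: submit the bios queued on this device */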
static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:       Optional. When provided, only unmounted devices matching this
 *              path are released.
 * @skip_dev:   Optional. This device is skipped when searching for stale
 *              devices.
 */
static void btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_dev)
{
        struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
        struct btrfs_device *dev, *tmp_dev;

        list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

                if (fs_devs->opened)
                        continue;

                list_for_each_entry_safe(dev, tmp_dev,
                                         &fs_devs->devices, dev_list) {
                        int not_found = 0;

                        if (skip_dev && skip_dev == dev)
                                continue;
                        if (path && !dev->name)
                                continue;

                        rcu_read_lock();
                        if (path)
                                not_found = strcmp(rcu_str_deref(dev->name),
                                                   path);
                        rcu_read_unlock();
                        if (not_found)
                                continue;

                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->fs_list);
                                free_fs_devices(fs_devs);
                                break;
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                btrfs_free_device(dev);
                        }
                }
        }
}

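/*
 * Open a single device and verify that the super block it contains matches
 * the devid and uuid we have on record; update the device state and the
 * fs_devices counters accordingly. Returns 0 on success or a negative errno.
 */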
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated on success
 * error pointer on failure
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return ERR_PTR(-EBUSY);

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                btrfs_free_stale_devices(path, device);

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL, that
                 *    means this device was missing at the time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path', that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions when it was away and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the one
                         * with the larger generation number or the last-in if
                         * generations are equal.
                         */
                        return ERR_PTR(-EEXIST);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return ERR_PTR(-ENOMEM);
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct, but zeroes the
         * generation along with most of the other members. So just update
         * it back here. We need it to pick the disk with the largest
         * generation (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        return device;
}

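/*
 * Duplicate @orig into a new btrfs_fs_devices with freshly allocated
 * btrfs_device entries (names copied, no open block devices). The caller
 * must hold uuid_mutex.
 */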
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We have held the volume lock, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);
        btrfs_free_device(device);
}

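/* Flush and release the block device backing @device, if one is open */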
static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

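/*
 * Replace @device in the fs_devices list with a freshly allocated copy that
 * carries only the identity (devid, uuid, name), so the original can be
 * closed and freed once RCU readers are done with it.
 */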
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;
}

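/*
 * Drop one reference on @fs_devices and, when the last opener goes away,
 * close all devices and reset the open/seeding state.
 */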
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;

        INIT_LIST_HEAD(&pending_put);

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_prepare_close_one_device(device);
                list_add(&device->dev_list, &pending_put);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * btrfs_show_devname() takes the device_list_mutex, and a call to
         * blkdev_put() can lead the VFS back into this function. So, for
         * now, do the put outside of device_list_mutex.
         */
        while (!list_empty(&pending_put)) {
                device = list_first_entry(&pending_put,
                                struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_close_bdev(device);
                call_rcu(&device->rcu, free_device_rcu);
        }

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

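/*
 * Open all devices in @fs_devices, remembering the one with the highest
 * generation as latest_bdev. Fails with -EINVAL only if no device could be
 * opened at all.
 */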
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

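/* list_sort() comparator: order devices by ascending devid */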
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

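/* Release the page mapping and reference taken by btrfs_read_disk_super() */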
static void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct page *page;
        int ret = 0;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                ret = -EINVAL;
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super);
        if (IS_ERR(device))
                ret = PTR_ERR(device);
        else
                *fs_devices_ret = device->fs_devices;

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->fs_info->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

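/*
 * Check whether any pending or pinned chunk overlaps the device range
 * [*start, *start + len). If so, bump *start past the overlapping stripe
 * and return 1 so the caller can retry the search; return 0 otherwise.
 */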
static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space
 * @len:          the size of the free space that we find, or the size of the
 *                max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

1537                         /*
1538                          * If this free space is at least as large as what
1539                          * we need,
1539                          * it must be the max free space that we have found
1540                          * until now, so max_hole_start must point to the start
1541                          * of this free space and the length of this free space
1542                          * is stored in max_hole_size. Thus, we return
1543                          * max_hole_start and max_hole_size and go back to the
1544                          * caller.
1545                          */
1546                         if (hole_size >= num_bytes) {
1547                                 ret = 0;
1548                                 goto out;
1549                         }
1550                 }
1551
1552                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1553                 extent_end = key.offset + btrfs_dev_extent_length(l,
1554                                                                   dev_extent);
1555                 if (extent_end > search_start)
1556                         search_start = extent_end;
1557 next:
1558                 path->slots[0]++;
1559                 cond_resched();
1560         }
1561
1562         /*
1563          * At this point, search_start should be the end of
1564          * allocated dev extents, and when shrinking the device,
1565          * search_end may be smaller than search_start.
1566          */
1567         if (search_end > search_start) {
1568                 hole_size = search_end - search_start;
1569
1570                 if (contains_pending_extent(transaction, device, &search_start,
1571                                             hole_size)) {
1572                         btrfs_release_path(path);
1573                         goto again;
1574                 }
1575
1576                 if (hole_size > max_hole_size) {
1577                         max_hole_start = search_start;
1578                         max_hole_size = hole_size;
1579                 }
1580         }
1581
1582         /* See above. */
1583         if (max_hole_size < num_bytes)
1584                 ret = -ENOSPC;
1585         else
1586                 ret = 0;
1587
1588 out:
1589         btrfs_free_path(path);
1590         *start = max_hole_start;
1591         if (len)
1592                 *len = max_hole_size;
1593         return ret;
1594 }
1595
1596 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1597                          struct btrfs_device *device, u64 num_bytes,
1598                          u64 *start, u64 *len)
1599 {
1600         /* FIXME use last free of some kind */
1601         return find_free_dev_extent_start(trans->transaction, device,
1602                                           num_bytes, 0, start, len);
1603 }
1604
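/*
 * Remove the dev extent item that covers @start on @device from the device
 * tree and report the extent's length back through @dev_extent_len.
 */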
1605 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1606                           struct btrfs_device *device,
1607                           u64 start, u64 *dev_extent_len)
1608 {
1609         struct btrfs_fs_info *fs_info = device->fs_info;
1610         struct btrfs_root *root = fs_info->dev_root;
1611         int ret;
1612         struct btrfs_path *path;
1613         struct btrfs_key key;
1614         struct btrfs_key found_key;
1615         struct extent_buffer *leaf = NULL;
1616         struct btrfs_dev_extent *extent = NULL;
1617
1618         path = btrfs_alloc_path();
1619         if (!path)
1620                 return -ENOMEM;
1621
1622         key.objectid = device->devid;
1623         key.offset = start;
1624         key.type = BTRFS_DEV_EXTENT_KEY;
1625 again:
1626         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1627         if (ret > 0) {
1628                 ret = btrfs_previous_item(root, path, key.objectid,
1629                                           BTRFS_DEV_EXTENT_KEY);
1630                 if (ret)
1631                         goto out;
1632                 leaf = path->nodes[0];
1633                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1634                 extent = btrfs_item_ptr(leaf, path->slots[0],
1635                                         struct btrfs_dev_extent);
1636                 BUG_ON(found_key.offset > start || found_key.offset +
1637                        btrfs_dev_extent_length(leaf, extent) < start);
1638                 key = found_key;
1639                 btrfs_release_path(path);
1640                 goto again;
1641         } else if (ret == 0) {
1642                 leaf = path->nodes[0];
1643                 extent = btrfs_item_ptr(leaf, path->slots[0],
1644                                         struct btrfs_dev_extent);
1645         } else {
1646                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1647                 goto out;
1648         }
1649
1650         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1651
1652         ret = btrfs_del_item(trans, root, path);
1653         if (ret) {
1654                 btrfs_handle_fs_error(fs_info, ret,
1655                                       "Failed to remove dev extent item");
1656         } else {
1657                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1658         }
1659 out:
1660         btrfs_free_path(path);
1661         return ret;
1662 }
1663
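/*
 * Insert a dev extent item for @device covering [@start, @start + @num_bytes)
 * and point it at the chunk at @chunk_offset in the chunk tree.
 */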
1664 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1665                                   struct btrfs_device *device,
1666                                   u64 chunk_offset, u64 start, u64 num_bytes)
1667 {
1668         int ret;
1669         struct btrfs_path *path;
1670         struct btrfs_fs_info *fs_info = device->fs_info;
1671         struct btrfs_root *root = fs_info->dev_root;
1672         struct btrfs_dev_extent *extent;
1673         struct extent_buffer *leaf;
1674         struct btrfs_key key;
1675
1676         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1677         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1678         path = btrfs_alloc_path();
1679         if (!path)
1680                 return -ENOMEM;
1681
1682         key.objectid = device->devid;
1683         key.offset = start;
1684         key.type = BTRFS_DEV_EXTENT_KEY;
1685         ret = btrfs_insert_empty_item(trans, root, path, &key,
1686                                       sizeof(*extent));
1687         if (ret)
1688                 goto out;
1689
1690         leaf = path->nodes[0];
1691         extent = btrfs_item_ptr(leaf, path->slots[0],
1692                                 struct btrfs_dev_extent);
1693         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1694                                         BTRFS_CHUNK_TREE_OBJECTID);
1695         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1696                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1697         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1698
1699         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1700         btrfs_mark_buffer_dirty(leaf);
1701 out:
1702         btrfs_free_path(path);
1703         return ret;
1704 }
1705
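/*
 * Return the logical start for the next chunk, i.e. the end offset of the
 * last extent map in the mapping tree, or 0 if the tree is empty.
 */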
1706 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1707 {
1708         struct extent_map_tree *em_tree;
1709         struct extent_map *em;
1710         struct rb_node *n;
1711         u64 ret = 0;
1712
1713         em_tree = &fs_info->mapping_tree.map_tree;
1714         read_lock(&em_tree->lock);
1715         n = rb_last(&em_tree->map);
1716         if (n) {
1717                 em = rb_entry(n, struct extent_map, rb_node);
1718                 ret = em->start + em->len;
1719         }
1720         read_unlock(&em_tree->lock);
1721
1722         return ret;
1723 }
1724
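/*
 * Find the next available device id by locating the highest existing dev
 * item in the chunk tree and adding one to its key offset.
 */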
1725 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1726                                     u64 *devid_ret)
1727 {
1728         int ret;
1729         struct btrfs_key key;
1730         struct btrfs_key found_key;
1731         struct btrfs_path *path;
1732
1733         path = btrfs_alloc_path();
1734         if (!path)
1735                 return -ENOMEM;
1736
1737         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1738         key.type = BTRFS_DEV_ITEM_KEY;
1739         key.offset = (u64)-1;
1740
1741         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1742         if (ret < 0)
1743                 goto error;
1744
1745         BUG_ON(ret == 0); /* Corruption */
1746
1747         ret = btrfs_previous_item(fs_info->chunk_root, path,
1748                                   BTRFS_DEV_ITEMS_OBJECTID,
1749                                   BTRFS_DEV_ITEM_KEY);
1750         if (ret) {
1751                 *devid_ret = 1;
1752         } else {
1753                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1754                                       path->slots[0]);
1755                 *devid_ret = found_key.offset + 1;
1756         }
1757         ret = 0;
1758 error:
1759         btrfs_free_path(path);
1760         return ret;
1761 }
1762
1763 /*
1764  * The device information is stored in the chunk root.
1765  * The btrfs_device struct should be fully filled in.
1766  */
1767 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1768                             struct btrfs_fs_info *fs_info,
1769                             struct btrfs_device *device)
1770 {
1771         struct btrfs_root *root = fs_info->chunk_root;
1772         int ret;
1773         struct btrfs_path *path;
1774         struct btrfs_dev_item *dev_item;
1775         struct extent_buffer *leaf;
1776         struct btrfs_key key;
1777         unsigned long ptr;
1778
1779         path = btrfs_alloc_path();
1780         if (!path)
1781                 return -ENOMEM;
1782
1783         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1784         key.type = BTRFS_DEV_ITEM_KEY;
1785         key.offset = device->devid;
1786
1787         ret = btrfs_insert_empty_item(trans, root, path, &key,
1788                                       sizeof(*dev_item));
1789         if (ret)
1790                 goto out;
1791
1792         leaf = path->nodes[0];
1793         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1794
1795         btrfs_set_device_id(leaf, dev_item, device->devid);
1796         btrfs_set_device_generation(leaf, dev_item, 0);
1797         btrfs_set_device_type(leaf, dev_item, device->type);
1798         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1799         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1800         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1801         btrfs_set_device_total_bytes(leaf, dev_item,
1802                                      btrfs_device_get_disk_total_bytes(device));
1803         btrfs_set_device_bytes_used(leaf, dev_item,
1804                                     btrfs_device_get_bytes_used(device));
1805         btrfs_set_device_group(leaf, dev_item, 0);
1806         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1807         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1808         btrfs_set_device_start_offset(leaf, dev_item, 0);
1809
1810         ptr = btrfs_device_uuid(dev_item);
1811         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1812         ptr = btrfs_device_fsid(dev_item);
1813         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1814         btrfs_mark_buffer_dirty(leaf);
1815
1816         ret = 0;
1817 out:
1818         btrfs_free_path(path);
1819         return ret;
1820 }
1821
1822 /*
1823  * Function to update ctime/mtime for a given device path.
1824  * Mainly used for ctime/mtime based probes like libblkid.
1825  */
1826 static void update_dev_time(const char *path_name)
1827 {
1828         struct file *filp;
1829
1830         filp = filp_open(path_name, O_RDWR, 0);
1831         if (IS_ERR(filp))
1832                 return;
1833         file_update_time(filp);
1834         filp_close(filp, NULL);
1835 }
1836
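/*
 * Delete the dev item for @device from the chunk tree in its own
 * transaction, committing the transaction on success.
 */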
1837 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1838                              struct btrfs_device *device)
1839 {
1840         struct btrfs_root *root = fs_info->chunk_root;
1841         int ret;
1842         struct btrfs_path *path;
1843         struct btrfs_key key;
1844         struct btrfs_trans_handle *trans;
1845
1846         path = btrfs_alloc_path();
1847         if (!path)
1848                 return -ENOMEM;
1849
1850         trans = btrfs_start_transaction(root, 0);
1851         if (IS_ERR(trans)) {
1852                 btrfs_free_path(path);
1853                 return PTR_ERR(trans);
1854         }
1855         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1856         key.type = BTRFS_DEV_ITEM_KEY;
1857         key.offset = device->devid;
1858
1859         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1860         if (ret) {
1861                 if (ret > 0)
1862                         ret = -ENOENT;
1863                 btrfs_abort_transaction(trans, ret);
1864                 btrfs_end_transaction(trans);
1865                 goto out;
1866         }
1867
1868         ret = btrfs_del_item(trans, root, path);
1869         if (ret) {
1870                 btrfs_abort_transaction(trans, ret);
1871                 btrfs_end_transaction(trans);
1872         }
1873
1874 out:
1875         btrfs_free_path(path);
1876         if (!ret)
1877                 ret = btrfs_commit_transaction(trans);
1878         return ret;
1879 }
1880
1881 /*
1882  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1883  * filesystem. It's up to the caller to adjust that number regarding eg. device
1884  * replace.
1885  */
1886 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1887                 u64 num_devices)
1888 {
1889         u64 all_avail;
1890         unsigned seq;
1891         int i;
1892
1893         do {
1894                 seq = read_seqbegin(&fs_info->profiles_lock);
1895
1896                 all_avail = fs_info->avail_data_alloc_bits |
1897                             fs_info->avail_system_alloc_bits |
1898                             fs_info->avail_metadata_alloc_bits;
1899         } while (read_seqretry(&fs_info->profiles_lock, seq));
1900
1901         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1902                 if (!(all_avail & btrfs_raid_group[i]))
1903                         continue;
1904
1905                 if (num_devices < btrfs_raid_array[i].devs_min) {
1906                         int ret = btrfs_raid_mindev_error[i];
1907
1908                         if (ret)
1909                                 return ret;
1910                 }
1911         }
1912
1913         return 0;
1914 }
1915
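/*
 * Return any device in @fs_devs other than @device that is open and not
 * missing, or NULL if there is none.
 */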
1916 static struct btrfs_device *btrfs_find_next_active_device(
1917                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1918 {
1919         struct btrfs_device *next_device;
1920
1921         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1922                 if (next_device != device &&
1923                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1924                     && next_device->bdev)
1925                         return next_device;
1926         }
1927
1928         return NULL;
1929 }
1930
1931 /*
1932  * Helper function to check if the given device is part of s_bdev / latest_bdev
1933  * and replace it with the provided or the next active device. In the context
1934  * where this function is called, there should always be another device (or
1935  * this_dev) which is active.
1936  */
1937 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1938                 struct btrfs_device *device, struct btrfs_device *this_dev)
1939 {
1940         struct btrfs_device *next_device;
1941
1942         if (this_dev)
1943                 next_device = this_dev;
1944         else
1945                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1946                                                                 device);
1947         ASSERT(next_device);
1948
1949         if (fs_info->sb->s_bdev &&
1950                         (fs_info->sb->s_bdev == device->bdev))
1951                 fs_info->sb->s_bdev = next_device->bdev;
1952
1953         if (fs_info->fs_devices->latest_bdev == device->bdev)
1954                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1955 }
1956
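/*
 * Remove a device from the filesystem: shrink it to zero so all its chunks
 * are relocated, delete its dev item, drop it from the device lists and
 * scratch its superblocks so it is no longer recognized as a btrfs member.
 */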
1957 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1958                 u64 devid)
1959 {
1960         struct btrfs_device *device;
1961         struct btrfs_fs_devices *cur_devices;
1962         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1963         u64 num_devices;
1964         int ret = 0;
1965
1966         mutex_lock(&uuid_mutex);
1967
1968         num_devices = fs_devices->num_devices;
1969         btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1970         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1971                 WARN_ON(num_devices < 1);
1972                 num_devices--;
1973         }
1974         btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
1975
1976         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1977         if (ret)
1978                 goto out;
1979
1980         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1981                                            &device);
1982         if (ret)
1983                 goto out;
1984
1985         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1986                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1987                 goto out;
1988         }
1989
1990         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1991             fs_info->fs_devices->rw_devices == 1) {
1992                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1993                 goto out;
1994         }
1995
1996         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1997                 mutex_lock(&fs_info->chunk_mutex);
1998                 list_del_init(&device->dev_alloc_list);
1999                 device->fs_devices->rw_devices--;
2000                 mutex_unlock(&fs_info->chunk_mutex);
2001         }
2002
2003         mutex_unlock(&uuid_mutex);
2004         ret = btrfs_shrink_device(device, 0);
2005         mutex_lock(&uuid_mutex);
2006         if (ret)
2007                 goto error_undo;
2008
2009         /*
2010          * TODO: the superblock still includes this device in its num_devices
2011          * counter although write_all_supers() is not locked out. This
2012          * could give a filesystem state which requires a degraded mount.
2013          */
2014         ret = btrfs_rm_dev_item(fs_info, device);
2015         if (ret)
2016                 goto error_undo;
2017
2018         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2019         btrfs_scrub_cancel_dev(fs_info, device);
2020
2021         /*
2022          * the device list mutex makes sure that we don't change
2023          * the device list while someone else is writing out all
2024          * the device supers. Whoever is writing all supers, should
2025          * lock the device list mutex before getting the number of
2026          * devices in the super block (super_copy). Conversely,
2027          * whoever updates the number of devices in the super block
2028          * (super_copy) should hold the device list mutex.
2029          */
2030
2031         cur_devices = device->fs_devices;
2032         mutex_lock(&fs_devices->device_list_mutex);
2033         list_del_rcu(&device->dev_list);
2034
2035         device->fs_devices->num_devices--;
2036         device->fs_devices->total_devices--;
2037
2038         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2039                 device->fs_devices->missing_devices--;
2040
2041         btrfs_assign_next_active_device(fs_info, device, NULL);
2042
2043         if (device->bdev) {
2044                 device->fs_devices->open_devices--;
2045                 /* remove sysfs entry */
2046                 btrfs_sysfs_rm_device_link(fs_devices, device);
2047         }
2048
2049         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2050         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2051         mutex_unlock(&fs_devices->device_list_mutex);
2052
2053         /*
2054          * at this point, the device is zero sized and detached from
2055          * the devices list.  All that's left is to zero out the old
2056          * supers and free the device.
2057          */
2058         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2059                 btrfs_scratch_superblocks(device->bdev, device->name->str);
2060
2061         btrfs_close_bdev(device);
2062         call_rcu(&device->rcu, free_device_rcu);
2063
2064         if (cur_devices->open_devices == 0) {
2065                 while (fs_devices) {
2066                         if (fs_devices->seed == cur_devices) {
2067                                 fs_devices->seed = cur_devices->seed;
2068                                 break;
2069                         }
2070                         fs_devices = fs_devices->seed;
2071                 }
2072                 cur_devices->seed = NULL;
2073                 close_fs_devices(cur_devices);
2074                 free_fs_devices(cur_devices);
2075         }
2076
2077 out:
2078         mutex_unlock(&uuid_mutex);
2079         return ret;
2080
2081 error_undo:
2082         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2083                 mutex_lock(&fs_info->chunk_mutex);
2084                 list_add(&device->dev_alloc_list,
2085                          &fs_devices->alloc_list);
2086                 device->fs_devices->rw_devices++;
2087                 mutex_unlock(&fs_info->chunk_mutex);
2088         }
2089         goto out;
2090 }
2091
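/*
 * Detach the replace source device @srcdev from its fs_devices lists and
 * adjust the device counters; the caller must hold device_list_mutex.
 */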
2092 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
2093                                         struct btrfs_device *srcdev)
2094 {
2095         struct btrfs_fs_devices *fs_devices;
2096
2097         lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
2098
2099         /*
2100          * In case of an fs with no seed, srcdev->fs_devices will point to
2101          * the fs_devices of fs_info. However, when the dev being replaced is
2102          * a seed dev it will point to the seed's local fs_devices. In short,
2103          * srcdev will have its correct fs_devices in both cases.
2104          */
2105         fs_devices = srcdev->fs_devices;
2106
2107         list_del_rcu(&srcdev->dev_list);
2108         list_del(&srcdev->dev_alloc_list);
2109         fs_devices->num_devices--;
2110         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2111                 fs_devices->missing_devices--;
2112
2113         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2114                 fs_devices->rw_devices--;
2115
2116         if (srcdev->bdev)
2117                 fs_devices->open_devices--;
2118 }
2119
2120 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2121                                       struct btrfs_device *srcdev)
2122 {
2123         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2124
2125         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2126                 /* zero out the old super if it is writable */
2127                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2128         }
2129
2130         btrfs_close_bdev(srcdev);
2131         call_rcu(&srcdev->rcu, free_device_rcu);
2132
2133         /* if there are no devices left we'd rather delete the fs_devices */
2134         if (!fs_devices->num_devices) {
2135                 struct btrfs_fs_devices *tmp_fs_devices;
2136
2137                 /*
2138                  * On a mounted FS, num_devices can't be zero unless it's a
2139                  * seed. In case of a seed device being replaced, the replace
2140                  * target is added to the sprout FS, so there will be no
2141                  * device left under the seed FS.
2142                  */
2143                 ASSERT(fs_devices->seeding);
2144
2145                 tmp_fs_devices = fs_info->fs_devices;
2146                 while (tmp_fs_devices) {
2147                         if (tmp_fs_devices->seed == fs_devices) {
2148                                 tmp_fs_devices->seed = fs_devices->seed;
2149                                 break;
2150                         }
2151                         tmp_fs_devices = tmp_fs_devices->seed;
2152                 }
2153                 fs_devices->seed = NULL;
2154                 close_fs_devices(fs_devices);
2155                 free_fs_devices(fs_devices);
2156         }
2157 }
2158
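/*
 * Tear down the replace target device @tgtdev: unlink it from sysfs and the
 * device list, scratch its superblocks and release the block device.
 */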
2159 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2160                                       struct btrfs_device *tgtdev)
2161 {
2162         mutex_lock(&uuid_mutex);
2163         WARN_ON(!tgtdev);
2164         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2165
2166         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2167
2168         if (tgtdev->bdev)
2169                 fs_info->fs_devices->open_devices--;
2170
2171         fs_info->fs_devices->num_devices--;
2172
2173         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2174
2175         list_del_rcu(&tgtdev->dev_list);
2176
2177         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2178         mutex_unlock(&uuid_mutex);
2179
2180         /*
2181          * The update_dev_time() within btrfs_scratch_superblocks()
2182          * may lead to a call to btrfs_show_devname() which will try
2183          * to hold device_list_mutex. And here this device
2184          * is already out of the device list, so we don't have to hold
2185          * the device_list_mutex lock.
2186          */
2187         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2188
2189         btrfs_close_bdev(tgtdev);
2190         call_rcu(&tgtdev->rcu, free_device_rcu);
2191 }
2192
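/*
 * Read the superblock from the device at @device_path and look up the
 * matching btrfs_device by devid, device uuid and fsid.
 */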
2193 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2194                                      const char *device_path,
2195                                      struct btrfs_device **device)
2196 {
2197         int ret = 0;
2198         struct btrfs_super_block *disk_super;
2199         u64 devid;
2200         u8 *dev_uuid;
2201         struct block_device *bdev;
2202         struct buffer_head *bh;
2203
2204         *device = NULL;
2205         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2206                                     fs_info->bdev_holder, 0, &bdev, &bh);
2207         if (ret)
2208                 return ret;
2209         disk_super = (struct btrfs_super_block *)bh->b_data;
2210         devid = btrfs_stack_device_id(&disk_super->dev_item);
2211         dev_uuid = disk_super->dev_item.uuid;
2212         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2213         brelse(bh);
2214         if (!*device)
2215                 ret = -ENOENT;
2216         blkdev_put(bdev, FMODE_READ);
2217         return ret;
2218 }
2219
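/*
 * When @device_path is the literal string "missing", find the device that
 * is flagged in-metadata but has no bdev; otherwise look the device up by
 * its path.
 */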
2220 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2221                                          const char *device_path,
2222                                          struct btrfs_device **device)
2223 {
2224         *device = NULL;
2225         if (strcmp(device_path, "missing") == 0) {
2226                 struct list_head *devices;
2227                 struct btrfs_device *tmp;
2228
2229                 devices = &fs_info->fs_devices->devices;
2230                 list_for_each_entry(tmp, devices, dev_list) {
2231                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2232                                         &tmp->dev_state) && !tmp->bdev) {
2233                                 *device = tmp;
2234                                 break;
2235                         }
2236                 }
2237
2238                 if (!*device)
2239                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2240
2241                 return 0;
2242         } else {
2243                 return btrfs_find_device_by_path(fs_info, device_path, device);
2244         }
2245 }
2246
2247 /*
2248  * Lookup a device given by device id, or the path if the id is 0.
2249  */
2250 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2251                                  const char *devpath,
2252                                  struct btrfs_device **device)
2253 {
2254         int ret;
2255
2256         if (devid) {
2257                 ret = 0;
2258                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2259                 if (!*device)
2260                         ret = -ENOENT;
2261         } else {
2262                 if (!devpath || !devpath[0])
2263                         return -EINVAL;
2264
2265                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2266                                                            device);
2267         }
2268         return ret;
2269 }
2270
2271 /*
2272  * Does all the dirty work required for changing the file system's UUID.
2273  */
2274 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2275 {
2276         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2277         struct btrfs_fs_devices *old_devices;
2278         struct btrfs_fs_devices *seed_devices;
2279         struct btrfs_super_block *disk_super = fs_info->super_copy;
2280         struct btrfs_device *device;
2281         u64 super_flags;
2282
2283         lockdep_assert_held(&uuid_mutex);
2284         if (!fs_devices->seeding)
2285                 return -EINVAL;
2286
2287         seed_devices = alloc_fs_devices(NULL);
2288         if (IS_ERR(seed_devices))
2289                 return PTR_ERR(seed_devices);
2290
2291         old_devices = clone_fs_devices(fs_devices);
2292         if (IS_ERR(old_devices)) {
2293                 kfree(seed_devices);
2294                 return PTR_ERR(old_devices);
2295         }
2296
2297         list_add(&old_devices->fs_list, &fs_uuids);
2298
2299         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2300         seed_devices->opened = 1;
2301         INIT_LIST_HEAD(&seed_devices->devices);
2302         INIT_LIST_HEAD(&seed_devices->alloc_list);
2303         mutex_init(&seed_devices->device_list_mutex);
2304
2305         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2306         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2307                               synchronize_rcu);
2308         list_for_each_entry(device, &seed_devices->devices, dev_list)
2309                 device->fs_devices = seed_devices;
2310
2311         mutex_lock(&fs_info->chunk_mutex);
2312         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2313         mutex_unlock(&fs_info->chunk_mutex);
2314
2315         fs_devices->seeding = 0;
2316         fs_devices->num_devices = 0;
2317         fs_devices->open_devices = 0;
2318         fs_devices->missing_devices = 0;
2319         fs_devices->rotating = 0;
2320         fs_devices->seed = seed_devices;
2321
2322         generate_random_uuid(fs_devices->fsid);
2323         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2324         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2325         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2326
2327         super_flags = btrfs_super_flags(disk_super) &
2328                       ~BTRFS_SUPER_FLAG_SEEDING;
2329         btrfs_set_super_flags(disk_super, super_flags);
2330
2331         return 0;
2332 }
2333
2334 /*
2335  * Store the expected generation for seed devices in device items.
2336  */
2337 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2338                                struct btrfs_fs_info *fs_info)
2339 {
2340         struct btrfs_root *root = fs_info->chunk_root;
2341         struct btrfs_path *path;
2342         struct extent_buffer *leaf;
2343         struct btrfs_dev_item *dev_item;
2344         struct btrfs_device *device;
2345         struct btrfs_key key;
2346         u8 fs_uuid[BTRFS_FSID_SIZE];
2347         u8 dev_uuid[BTRFS_UUID_SIZE];
2348         u64 devid;
2349         int ret;
2350
2351         path = btrfs_alloc_path();
2352         if (!path)
2353                 return -ENOMEM;
2354
2355         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2356         key.offset = 0;
2357         key.type = BTRFS_DEV_ITEM_KEY;
2358
2359         while (1) {
2360                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2361                 if (ret < 0)
2362                         goto error;
2363
2364                 leaf = path->nodes[0];
2365 next_slot:
2366                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2367                         ret = btrfs_next_leaf(root, path);
2368                         if (ret > 0)
2369                                 break;
2370                         if (ret < 0)
2371                                 goto error;
2372                         leaf = path->nodes[0];
2373                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2374                         btrfs_release_path(path);
2375                         continue;
2376                 }
2377
2378                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2379                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2380                     key.type != BTRFS_DEV_ITEM_KEY)
2381                         break;
2382
2383                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2384                                           struct btrfs_dev_item);
2385                 devid = btrfs_device_id(leaf, dev_item);
2386                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2387                                    BTRFS_UUID_SIZE);
2388                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2389                                    BTRFS_FSID_SIZE);
2390                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2391                 BUG_ON(!device); /* Logic error */
2392
2393                 if (device->fs_devices->seeding) {
2394                         btrfs_set_device_generation(leaf, dev_item,
2395                                                     device->generation);
2396                         btrfs_mark_buffer_dirty(leaf);
2397                 }
2398
2399                 path->slots[0]++;
2400                 goto next_slot;
2401         }
2402         ret = 0;
2403 error:
2404         btrfs_free_path(path);
2405         return ret;
2406 }
2407
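/*
 * Add the device at @device_path to the filesystem: allocate the in-memory
 * device, create its dev item, update the super block counters and, when
 * adding to a seed filesystem, sprout a new writable filesystem on top.
 */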
2408 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2409 {
2410         struct btrfs_root *root = fs_info->dev_root;
2411         struct request_queue *q;
2412         struct btrfs_trans_handle *trans;
2413         struct btrfs_device *device;
2414         struct block_device *bdev;
2415         struct list_head *devices;
2416         struct super_block *sb = fs_info->sb;
2417         struct rcu_string *name;
2418         u64 tmp;
2419         int seeding_dev = 0;
2420         int ret = 0;
2421         bool unlocked = false;
2422
2423         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2424                 return -EROFS;
2425
2426         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2427                                   fs_info->bdev_holder);
2428         if (IS_ERR(bdev))
2429                 return PTR_ERR(bdev);
2430
2431         if (fs_info->fs_devices->seeding) {
2432                 seeding_dev = 1;
2433                 down_write(&sb->s_umount);
2434                 mutex_lock(&uuid_mutex);
2435         }
2436
2437         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2438
2439         devices = &fs_info->fs_devices->devices;
2440
2441         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2442         list_for_each_entry(device, devices, dev_list) {
2443                 if (device->bdev == bdev) {
2444                         ret = -EEXIST;
2445                         mutex_unlock(
2446                                 &fs_info->fs_devices->device_list_mutex);
2447                         goto error;
2448                 }
2449         }
2450         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2451
2452         device = btrfs_alloc_device(fs_info, NULL, NULL);
2453         if (IS_ERR(device)) {
2454                 /* we can safely leave the fs_devices entry around */
2455                 ret = PTR_ERR(device);
2456                 goto error;
2457         }
2458
2459         name = rcu_string_strdup(device_path, GFP_KERNEL);
2460         if (!name) {
2461                 ret = -ENOMEM;
2462                 goto error_free_device;
2463         }
2464         rcu_assign_pointer(device->name, name);
2465
2466         trans = btrfs_start_transaction(root, 0);
2467         if (IS_ERR(trans)) {
2468                 ret = PTR_ERR(trans);
2469                 goto error_free_device;
2470         }
2471
2472         q = bdev_get_queue(bdev);
2473         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2474         device->generation = trans->transid;
2475         device->io_width = fs_info->sectorsize;
2476         device->io_align = fs_info->sectorsize;
2477         device->sector_size = fs_info->sectorsize;
2478         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2479                                          fs_info->sectorsize);
2480         device->disk_total_bytes = device->total_bytes;
2481         device->commit_total_bytes = device->total_bytes;
2482         device->fs_info = fs_info;
2483         device->bdev = bdev;
2484         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2485         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2486         device->mode = FMODE_EXCL;
2487         device->dev_stats_valid = 1;
2488         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2489
2490         if (seeding_dev) {
2491                 sb->s_flags &= ~SB_RDONLY;
2492                 ret = btrfs_prepare_sprout(fs_info);
2493                 if (ret) {
2494                         btrfs_abort_transaction(trans, ret);
2495                         goto error_trans;
2496                 }
2497         }
2498
2499         device->fs_devices = fs_info->fs_devices;
2500
2501         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2502         mutex_lock(&fs_info->chunk_mutex);
2503         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2504         list_add(&device->dev_alloc_list,
2505                  &fs_info->fs_devices->alloc_list);
2506         fs_info->fs_devices->num_devices++;
2507         fs_info->fs_devices->open_devices++;
2508         fs_info->fs_devices->rw_devices++;
2509         fs_info->fs_devices->total_devices++;
2510         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2511
2512         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2513
2514         if (!blk_queue_nonrot(q))
2515                 fs_info->fs_devices->rotating = 1;
2516
2517         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2518         btrfs_set_super_total_bytes(fs_info->super_copy,
2519                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2520
2521         tmp = btrfs_super_num_devices(fs_info->super_copy);
2522         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2523
2524         /* add sysfs device entry */
2525         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2526
2527         /*
2528          * we've got more storage, clear any full flags on the space
2529          * infos
2530          */
2531         btrfs_clear_space_info_full(fs_info);
2532
2533         mutex_unlock(&fs_info->chunk_mutex);
2534         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2535
2536         if (seeding_dev) {
2537                 mutex_lock(&fs_info->chunk_mutex);
2538                 ret = init_first_rw_device(trans, fs_info);
2539                 mutex_unlock(&fs_info->chunk_mutex);
2540                 if (ret) {
2541                         btrfs_abort_transaction(trans, ret);
2542                         goto error_sysfs;
2543                 }
2544         }
2545
2546         ret = btrfs_add_dev_item(trans, fs_info, device);
2547         if (ret) {
2548                 btrfs_abort_transaction(trans, ret);
2549                 goto error_sysfs;
2550         }
2551
2552         if (seeding_dev) {
2553                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2554
2555                 ret = btrfs_finish_sprout(trans, fs_info);
2556                 if (ret) {
2557                         btrfs_abort_transaction(trans, ret);
2558                         goto error_sysfs;
2559                 }
2560
2561                 /* Sprouting would change the fsid of the mounted root,
2562                  * so rename the fsid in sysfs.
2563                  */
2564                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2565                                                 fs_info->fsid);
2566                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2567                         btrfs_warn(fs_info,
2568                                    "sysfs: failed to create fsid for sprout");
2569         }
2570
2571         ret = btrfs_commit_transaction(trans);
2572
2573         if (seeding_dev) {
2574                 mutex_unlock(&uuid_mutex);
2575                 up_write(&sb->s_umount);
2576                 unlocked = true;
2577
2578                 if (ret) /* transaction commit */
2579                         return ret;
2580
2581                 ret = btrfs_relocate_sys_chunks(fs_info);
2582                 if (ret < 0)
2583                         btrfs_handle_fs_error(fs_info, ret,
2584                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2585                 trans = btrfs_attach_transaction(root);
2586                 if (IS_ERR(trans)) {
2587                         if (PTR_ERR(trans) == -ENOENT)
2588                                 return 0;
2589                         ret = PTR_ERR(trans);
2590                         trans = NULL;
2591                         goto error_sysfs;
2592                 }
2593                 ret = btrfs_commit_transaction(trans);
2594         }
2595
2596         /* Update ctime/mtime for libblkid */
2597         update_dev_time(device_path);
2598         return ret;
2599
2600 error_sysfs:
2601         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2602 error_trans:
2603         if (seeding_dev)
2604                 sb->s_flags |= SB_RDONLY;
2605         if (trans)
2606                 btrfs_end_transaction(trans);
2607 error_free_device:
2608         btrfs_free_device(device);
2609 error:
2610         blkdev_put(bdev, FMODE_EXCL);
2611         if (seeding_dev && !unlocked) {
2612                 mutex_unlock(&uuid_mutex);
2613                 up_write(&sb->s_umount);
2614         }
2615         return ret;
2616 }
2617
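/*
 * Write the current in-memory state of @device (sizes, alignment, type)
 * back into its dev item in the chunk tree.
 */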
2618 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2619                                         struct btrfs_device *device)
2620 {
2621         int ret;
2622         struct btrfs_path *path;
2623         struct btrfs_root *root = device->fs_info->chunk_root;
2624         struct btrfs_dev_item *dev_item;
2625         struct extent_buffer *leaf;
2626         struct btrfs_key key;
2627
2628         path = btrfs_alloc_path();
2629         if (!path)
2630                 return -ENOMEM;
2631
2632         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2633         key.type = BTRFS_DEV_ITEM_KEY;
2634         key.offset = device->devid;
2635
2636         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2637         if (ret < 0)
2638                 goto out;
2639
2640         if (ret > 0) {
2641                 ret = -ENOENT;
2642                 goto out;
2643         }
2644
2645         leaf = path->nodes[0];
2646         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2647
2648         btrfs_set_device_id(leaf, dev_item, device->devid);
2649         btrfs_set_device_type(leaf, dev_item, device->type);
2650         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2651         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2652         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2653         btrfs_set_device_total_bytes(leaf, dev_item,
2654                                      btrfs_device_get_disk_total_bytes(device));
2655         btrfs_set_device_bytes_used(leaf, dev_item,
2656                                     btrfs_device_get_bytes_used(device));
2657         btrfs_mark_buffer_dirty(leaf);
2658
2659 out:
2660         btrfs_free_path(path);
2661         return ret;
2662 }
2663
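/*
 * Grow @device to @new_size (rounded down to a sector boundary), updating
 * the super block total and queueing the device on the resized list.
 */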
2664 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2665                       struct btrfs_device *device, u64 new_size)
2666 {
2667         struct btrfs_fs_info *fs_info = device->fs_info;
2668         struct btrfs_super_block *super_copy = fs_info->super_copy;
2669         struct btrfs_fs_devices *fs_devices;
2670         u64 old_total;
2671         u64 diff;
2672
2673         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2674                 return -EACCES;
2675
2676         new_size = round_down(new_size, fs_info->sectorsize);
2677
2678         mutex_lock(&fs_info->chunk_mutex);
2679         old_total = btrfs_super_total_bytes(super_copy);
2680         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2681
2682         if (new_size <= device->total_bytes ||
2683             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2684                 mutex_unlock(&fs_info->chunk_mutex);
2685                 return -EINVAL;
2686         }
2687
2688         fs_devices = fs_info->fs_devices;
2689
2690         btrfs_set_super_total_bytes(super_copy,
2691                         round_down(old_total + diff, fs_info->sectorsize));
2692         device->fs_devices->total_rw_bytes += diff;
2693
2694         btrfs_device_set_total_bytes(device, new_size);
2695         btrfs_device_set_disk_total_bytes(device, new_size);
2696         btrfs_clear_space_info_full(device->fs_info);
2697         if (list_empty(&device->resized_list))
2698                 list_add_tail(&device->resized_list,
2699                               &fs_devices->resized_devices);
2700         mutex_unlock(&fs_info->chunk_mutex);
2701
2702         return btrfs_update_device(trans, device);
2703 }
2704
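/*
 * Delete the chunk item at @chunk_offset from the chunk tree.
 */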
2705 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2706                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2707 {
2708         struct btrfs_root *root = fs_info->chunk_root;
2709         int ret;
2710         struct btrfs_path *path;
2711         struct btrfs_key key;
2712
2713         path = btrfs_alloc_path();
2714         if (!path)
2715                 return -ENOMEM;
2716
2717         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2718         key.offset = chunk_offset;
2719         key.type = BTRFS_CHUNK_ITEM_KEY;
2720
2721         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2722         if (ret < 0)
2723                 goto out;
2724         else if (ret > 0) { /* Logic error or corruption */
2725                 btrfs_handle_fs_error(fs_info, -ENOENT,
2726                                       "Failed lookup while freeing chunk.");
2727                 ret = -ENOENT;
2728                 goto out;
2729         }
2730
2731         ret = btrfs_del_item(trans, root, path);
2732         if (ret < 0)
2733                 btrfs_handle_fs_error(fs_info, ret,
2734                                       "Failed to delete chunk item.");
2735 out:
2736         btrfs_free_path(path);
2737         return ret;
2738 }
2739
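/*
 * Remove the chunk at @chunk_offset from the sys_chunk_array in the super
 * block copy and shrink the recorded array size accordingly.
 */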
2740 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2741 {
2742         struct btrfs_super_block *super_copy = fs_info->super_copy;
2743         struct btrfs_disk_key *disk_key;
2744         struct btrfs_chunk *chunk;
2745         u8 *ptr;
2746         int ret = 0;
2747         u32 num_stripes;
2748         u32 array_size;
2749         u32 len = 0;
2750         u32 cur;
2751         struct btrfs_key key;
2752
2753         mutex_lock(&fs_info->chunk_mutex);
2754         array_size = btrfs_super_sys_array_size(super_copy);
2755
2756         ptr = super_copy->sys_chunk_array;
2757         cur = 0;
2758
2759         while (cur < array_size) {
2760                 disk_key = (struct btrfs_disk_key *)ptr;
2761                 btrfs_disk_key_to_cpu(&key, disk_key);
2762
2763                 len = sizeof(*disk_key);
2764
2765                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2766                         chunk = (struct btrfs_chunk *)(ptr + len);
2767                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2768                         len += btrfs_chunk_item_size(num_stripes);
2769                 } else {
2770                         ret = -EIO;
2771                         break;
2772                 }
2773                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2774                     key.offset == chunk_offset) {
2775                         memmove(ptr, ptr + len, array_size - (cur + len));
2776                         array_size -= len;
2777                         btrfs_set_super_sys_array_size(super_copy, array_size);
2778                 } else {
2779                         ptr += len;
2780                         cur += len;
2781                 }
2782         }
2783         mutex_unlock(&fs_info->chunk_mutex);
2784         return ret;
2785 }
2786
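/*
 * Look up the extent map for the chunk covering [@logical, @logical +
 * @length) and sanity check that it actually contains @logical.
 */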
2787 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2788                                         u64 logical, u64 length)
2789 {
2790         struct extent_map_tree *em_tree;
2791         struct extent_map *em;
2792
2793         em_tree = &fs_info->mapping_tree.map_tree;
2794         read_lock(&em_tree->lock);
2795         em = lookup_extent_mapping(em_tree, logical, length);
2796         read_unlock(&em_tree->lock);
2797
2798         if (!em) {
2799                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2800                            logical, length);
2801                 return ERR_PTR(-EINVAL);
2802         }
2803
2804         if (em->start > logical || em->start + em->len < logical) {
2805                 btrfs_crit(fs_info,
2806                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2807                            logical, length, em->start, em->start + em->len);
2808                 free_extent_map(em);
2809                 return ERR_PTR(-EINVAL);
2810         }
2811
2812         /* callers are responsible for dropping em's ref. */
2813         return em;
2814 }
2815
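/*
 * Remove the chunk at @chunk_offset: free the dev extents of every stripe,
 * delete the chunk item (and its sys_chunk_array entry for system chunks)
 * and finally remove the now empty block group.
 */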
2816 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2817                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2818 {
2819         struct extent_map *em;
2820         struct map_lookup *map;
2821         u64 dev_extent_len = 0;
2822         int i, ret = 0;
2823         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2824
2825         em = get_chunk_map(fs_info, chunk_offset, 1);
2826         if (IS_ERR(em)) {
2827                 /*
2828                  * This is a logic error, but we don't want to just rely on the
2829                  * user having built with ASSERT enabled, so if ASSERT doesn't
2830                  * do anything we still error out.
2831                  */
2832                 ASSERT(0);
2833                 return PTR_ERR(em);
2834         }
2835         map = em->map_lookup;
2836         mutex_lock(&fs_info->chunk_mutex);
2837         check_system_chunk(trans, fs_info, map->type);
2838         mutex_unlock(&fs_info->chunk_mutex);
2839
2840         /*
2841          * Take the device list mutex to prevent races with the final phase of
2842          * a device replace operation that replaces the device object associated
2843          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2844          */
2845         mutex_lock(&fs_devices->device_list_mutex);
2846         for (i = 0; i < map->num_stripes; i++) {
2847                 struct btrfs_device *device = map->stripes[i].dev;
2848                 ret = btrfs_free_dev_extent(trans, device,
2849                                             map->stripes[i].physical,
2850                                             &dev_extent_len);
2851                 if (ret) {
2852                         mutex_unlock(&fs_devices->device_list_mutex);
2853                         btrfs_abort_transaction(trans, ret);
2854                         goto out;
2855                 }
2856
2857                 if (device->bytes_used > 0) {
2858                         mutex_lock(&fs_info->chunk_mutex);
2859                         btrfs_device_set_bytes_used(device,
2860                                         device->bytes_used - dev_extent_len);
2861                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2862                         btrfs_clear_space_info_full(fs_info);
2863                         mutex_unlock(&fs_info->chunk_mutex);
2864                 }
2865
2866                 if (map->stripes[i].dev) {
2867                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2868                         if (ret) {
2869                                 mutex_unlock(&fs_devices->device_list_mutex);
2870                                 btrfs_abort_transaction(trans, ret);
2871                                 goto out;
2872                         }
2873                 }
2874         }
2875         mutex_unlock(&fs_devices->device_list_mutex);
2876
2877         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2878         if (ret) {
2879                 btrfs_abort_transaction(trans, ret);
2880                 goto out;
2881         }
2882
2883         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2884
2885         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2886                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2887                 if (ret) {
2888                         btrfs_abort_transaction(trans, ret);
2889                         goto out;
2890                 }
2891         }
2892
2893         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2894         if (ret) {
2895                 btrfs_abort_transaction(trans, ret);
2896                 goto out;
2897         }
2898
2899 out:
2900         /* once for us */
2901         free_extent_map(em);
2902         return ret;
2903 }

static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
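
/*
 * Example (editorial sketch, not part of the original file): a minimal
 * caller pattern honouring the lockdep assertion above.  A real caller,
 * such as btrfs_relocate_sys_chunks() below, takes delete_unused_bgs_mutex
 * before searching the chunk tree and keeps it held until the chunk has
 * been removed.  The helper name is hypothetical.
 */
static int __maybe_unused example_relocate_one_chunk(struct btrfs_fs_info *fs_info,
						     u64 chunk_offset)
{
	int ret;

	mutex_lock(&fs_info->delete_unused_bgs_mutex);
	/* a real caller would (re)validate chunk_offset while holding the lock */
	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
	mutex_unlock(&fs_info->delete_unused_bgs_mutex);

	return ret;
}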

static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * Return 1 if a data chunk was allocated successfully,
 * <0 if an error occurred while allocating a data chunk,
 * 0 if no data chunk needed to be allocated.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans = btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans, fs_info,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}
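
/*
 * Example (editorial sketch, not part of the original file): consuming the
 * tri-state return value documented above.  Only a negative value is an
 * error; 1 merely reports that an empty data chunk had to be created
 * before relocating.  The helper name is hypothetical.
 */
static int __maybe_unused example_prepare_and_relocate(struct btrfs_fs_info *fs_info,
						       u64 chunk_offset)
{
	int ret;

	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
	if (ret < 0)
		return ret;	/* allocating the data chunk failed */
	/* ret == 1: a data chunk was forced; ret == 0: nothing was needed */

	mutex_lock(&fs_info->delete_unused_bgs_mutex);
	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
	mutex_unlock(&fs_info->delete_unused_bgs_mutex);

	return ret;
}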

static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
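
/*
 * Example (editorial sketch, not part of the original file): balance args
 * are persisted as little-endian struct btrfs_disk_balance_args and
 * converted with the btrfs_cpu_balance_args_to_disk() /
 * btrfs_disk_balance_args_to_cpu() helper pair, so a round trip through
 * the disk format preserves the in-memory args.  The helper name is
 * hypothetical.
 */
static void __maybe_unused example_bargs_round_trip(struct btrfs_balance_args *cpu)
{
	struct btrfs_disk_balance_args disk;
	struct btrfs_balance_args back;

	btrfs_cpu_balance_args_to_disk(&disk, cpu);
	btrfs_disk_balance_args_to_cpu(&back, &disk);
	/* 'back' is now field-for-field equal to '*cpu' */
}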

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on the usage filter if it is not already in use.  The
	 * idea is that chunks we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that would keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}
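
/*
 * Example (editorial sketch, not part of the original file): effect of the
 * heuristic above on a resumed "convert data to raid1" balance.  The data
 * args gain soft mode so already-converted chunks are skipped, while the
 * untouched metadata and system args pick up a 90% usage filter.  The
 * helper name is hypothetical and the control struct is zeroed purely for
 * illustration.
 */
static void __maybe_unused example_resumed_convert(struct btrfs_balance_control *bctl)
{
	memset(bctl, 0, sizeof(*bctl));
	bctl->data.flags = BTRFS_BALANCE_ARGS_CONVERT;
	bctl->data.target = BTRFS_BLOCK_GROUP_RAID1;

	update_balance_args(bctl);
	/*
	 * Now: data has CONVERT|SOFT set, while meta and sys each have
	 * USAGE set with usage == 90.
	 */
}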

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)