Merge tag 'for-4.15-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave...
[sfrench/cifs-2.6.git] / fs / btrfs / volumes.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/iocontext.h>
24 #include <linux/capability.h>
25 #include <linux/ratelimit.h>
26 #include <linux/kthread.h>
27 #include <linux/raid/pq.h>
28 #include <linux/semaphore.h>
29 #include <linux/uuid.h>
30 #include <asm/div64.h>
31 #include "ctree.h"
32 #include "extent_map.h"
33 #include "disk-io.h"
34 #include "transaction.h"
35 #include "print-tree.h"
36 #include "volumes.h"
37 #include "raid56.h"
38 #include "async-thread.h"
39 #include "check-integrity.h"
40 #include "rcu-string.h"
41 #include "math.h"
42 #include "dev-replace.h"
43 #include "sysfs.h"
44
/*
 * Per-profile RAID attributes, indexed by the BTRFS_RAID_* constants.
 *
 * A devs_max of 0 means "as many devices as possible" (see the RAID10 entry).
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
        },
};
110
/*
 * Table to convert a BTRFS_RAID_* index to the matching
 * BTRFS_BLOCK_GROUP_* flag.  BTRFS_RAID_SINGLE maps to 0.
 */
const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
        [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
        [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};
120
/*
 * Table to convert a BTRFS_RAID_* index to the error code reported when the
 * profile's minimum-number-of-devices condition is not met.  Zero means there
 * is no corresponding BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        [BTRFS_RAID_DUP]    = 0,
        [BTRFS_RAID_RAID0]  = 0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};
135
/* Forward declarations of file-local (static) helpers. */
static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);
147
/*
 * NOTE(review): uuid_mutex appears to serialize access to the fs_uuids list
 * and the device lists hanging off it — confirm against callers.
 */
DEFINE_MUTEX(uuid_mutex);
/* Global list of every known btrfs_fs_devices, linked via fs_devices::list. */
static LIST_HEAD(fs_uuids);
/* Return the head of the file-local fs_uuids list. */
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}
154
155 /*
156  * alloc_fs_devices - allocate struct btrfs_fs_devices
157  * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
158  *
159  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
160  * The returned struct is not linked onto any lists and can be destroyed with
161  * kfree() right away.
162  */
163 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
164 {
165         struct btrfs_fs_devices *fs_devs;
166
167         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
168         if (!fs_devs)
169                 return ERR_PTR(-ENOMEM);
170
171         mutex_init(&fs_devs->device_list_mutex);
172
173         INIT_LIST_HEAD(&fs_devs->devices);
174         INIT_LIST_HEAD(&fs_devs->resized_devices);
175         INIT_LIST_HEAD(&fs_devs->alloc_list);
176         INIT_LIST_HEAD(&fs_devs->list);
177         if (fsid)
178                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
179
180         return fs_devs;
181 }
182
183 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
184 {
185         struct btrfs_device *device;
186         WARN_ON(fs_devices->opened);
187         while (!list_empty(&fs_devices->devices)) {
188                 device = list_entry(fs_devices->devices.next,
189                                     struct btrfs_device, dev_list);
190                 list_del(&device->dev_list);
191                 rcu_string_free(device->name);
192                 bio_put(device->flush_bio);
193                 kfree(device);
194         }
195         kfree(fs_devices);
196 }
197
198 static void btrfs_kobject_uevent(struct block_device *bdev,
199                                  enum kobject_action action)
200 {
201         int ret;
202
203         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
204         if (ret)
205                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
206                         action,
207                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
208                         &disk_to_dev(bdev->bd_disk)->kobj);
209 }
210
211 void btrfs_cleanup_fs_uuids(void)
212 {
213         struct btrfs_fs_devices *fs_devices;
214
215         while (!list_empty(&fs_uuids)) {
216                 fs_devices = list_entry(fs_uuids.next,
217                                         struct btrfs_fs_devices, list);
218                 list_del(&fs_devices->list);
219                 free_fs_devices(fs_devices);
220         }
221 }
222
223 static struct btrfs_device *__alloc_device(void)
224 {
225         struct btrfs_device *dev;
226
227         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
228         if (!dev)
229                 return ERR_PTR(-ENOMEM);
230
231         /*
232          * Preallocate a bio that's always going to be used for flushing device
233          * barriers and matches the device lifespan
234          */
235         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
236         if (!dev->flush_bio) {
237                 kfree(dev);
238                 return ERR_PTR(-ENOMEM);
239         }
240
241         INIT_LIST_HEAD(&dev->dev_list);
242         INIT_LIST_HEAD(&dev->dev_alloc_list);
243         INIT_LIST_HEAD(&dev->resized_list);
244
245         spin_lock_init(&dev->io_lock);
246
247         spin_lock_init(&dev->reada_lock);
248         atomic_set(&dev->reada_in_flight, 0);
249         atomic_set(&dev->dev_stats_ccnt, 0);
250         btrfs_device_data_ordered_init(dev);
251         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
252         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
253
254         return dev;
255 }
256
257 /*
258  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
259  * return NULL.
260  *
261  * If devid and uuid are both specified, the match must be exact, otherwise
262  * only devid is used.
263  */
264 static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
265                 u64 devid, const u8 *uuid)
266 {
267         struct list_head *head = &fs_devices->devices;
268         struct btrfs_device *dev;
269
270         list_for_each_entry(dev, head, dev_list) {
271                 if (dev->devid == devid &&
272                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
273                         return dev;
274                 }
275         }
276         return NULL;
277 }
278
279 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
280 {
281         struct btrfs_fs_devices *fs_devices;
282
283         list_for_each_entry(fs_devices, &fs_uuids, list) {
284                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
285                         return fs_devices;
286         }
287         return NULL;
288 }
289
290 static int
291 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
292                       int flush, struct block_device **bdev,
293                       struct buffer_head **bh)
294 {
295         int ret;
296
297         *bdev = blkdev_get_by_path(device_path, flags, holder);
298
299         if (IS_ERR(*bdev)) {
300                 ret = PTR_ERR(*bdev);
301                 goto error;
302         }
303
304         if (flush)
305                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
306         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
307         if (ret) {
308                 blkdev_put(*bdev, flags);
309                 goto error;
310         }
311         invalidate_bdev(*bdev);
312         *bh = btrfs_read_dev_super(*bdev);
313         if (IS_ERR(*bh)) {
314                 ret = PTR_ERR(*bh);
315                 blkdev_put(*bdev, flags);
316                 goto error;
317         }
318
319         return 0;
320
321 error:
322         *bdev = NULL;
323         *bh = NULL;
324         return ret;
325 }
326
327 static void requeue_list(struct btrfs_pending_bios *pending_bios,
328                         struct bio *head, struct bio *tail)
329 {
330
331         struct bio *old_head;
332
333         old_head = pending_bios->head;
334         pending_bios->head = head;
335         if (pending_bios->tail)
336                 tail->bi_next = old_head;
337         else
338                 pending_bios->tail = tail;
339 }
340
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 *
 * Locking: device->io_lock protects the two pending lists and
 * device->running_pending.  The label loop_lock is always entered with
 * io_lock held; the label loop takes it.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        /* bios submitted from the list picked on this pass */
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         *
         * force_reg makes the choice alternate: a pass that took the sync
         * list sets it, so the next pass takes the regular list.
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        /* the chosen list is now owned locally via pending/tail */
        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        /*
                         * put the unprocessed remainder back and pick a
                         * list again; io_lock stays held across the jump
                         */
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                /* detach the first bio from the local chain */
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        /*
                         * give up for now: put the remainder back, keep
                         * running_pending set and queue this device's work
                         * item again on the submit workers
                         */
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        /*
         * re-check both lists under io_lock before returning; if either is
         * non-empty, go around again with the lock still held
         */
        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}
524
525 static void pending_bios_fn(struct btrfs_work *work)
526 {
527         struct btrfs_device *device;
528
529         device = container_of(work, struct btrfs_device, work);
530         run_scheduled_bios(device);
531 }
532
533
534 static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
535 {
536         struct btrfs_fs_devices *fs_devs;
537         struct btrfs_device *dev;
538
539         if (!cur_dev->name)
540                 return;
541
542         list_for_each_entry(fs_devs, &fs_uuids, list) {
543                 int del = 1;
544
545                 if (fs_devs->opened)
546                         continue;
547                 if (fs_devs->seeding)
548                         continue;
549
550                 list_for_each_entry(dev, &fs_devs->devices, dev_list) {
551
552                         if (dev == cur_dev)
553                                 continue;
554                         if (!dev->name)
555                                 continue;
556
557                         /*
558                          * Todo: This won't be enough. What if the same device
559                          * comes back (with new uuid and) with its mapper path?
560                          * But for now, this does help as mostly an admin will
561                          * either use mapper or non mapper path throughout.
562                          */
563                         rcu_read_lock();
564                         del = strcmp(rcu_str_deref(dev->name),
565                                                 rcu_str_deref(cur_dev->name));
566                         rcu_read_unlock();
567                         if (!del)
568                                 break;
569                 }
570
571                 if (!del) {
572                         /* delete the stale device */
573                         if (fs_devs->num_devices == 1) {
574                                 btrfs_sysfs_remove_fsid(fs_devs);
575                                 list_del(&fs_devs->list);
576                                 free_fs_devices(fs_devs);
577                         } else {
578                                 fs_devs->num_devices--;
579                                 list_del(&dev->dev_list);
580                                 rcu_string_free(dev->name);
581                                 bio_put(dev->flush_bio);
582                                 kfree(dev);
583                         }
584                         break;
585                 }
586         }
587 }
588
589 /*
590  * Add new device to list of registered devices
591  *
592  * Returns:
593  * 1   - first time device is seen
594  * 0   - device already known
595  * < 0 - error
596  */
597 static noinline int device_list_add(const char *path,
598                            struct btrfs_super_block *disk_super,
599                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
600 {
601         struct btrfs_device *device;
602         struct btrfs_fs_devices *fs_devices;
603         struct rcu_string *name;
604         int ret = 0;
605         u64 found_transid = btrfs_super_generation(disk_super);
606
607         fs_devices = find_fsid(disk_super->fsid);
608         if (!fs_devices) {
609                 fs_devices = alloc_fs_devices(disk_super->fsid);
610                 if (IS_ERR(fs_devices))
611                         return PTR_ERR(fs_devices);
612
613                 list_add(&fs_devices->list, &fs_uuids);
614
615                 device = NULL;
616         } else {
617                 device = find_device(fs_devices, devid,
618                                 disk_super->dev_item.uuid);
619         }
620
621         if (!device) {
622                 if (fs_devices->opened)
623                         return -EBUSY;
624
625                 device = btrfs_alloc_device(NULL, &devid,
626                                             disk_super->dev_item.uuid);
627                 if (IS_ERR(device)) {
628                         /* we can safely leave the fs_devices entry around */
629                         return PTR_ERR(device);
630                 }
631
632                 name = rcu_string_strdup(path, GFP_NOFS);
633                 if (!name) {
634                         bio_put(device->flush_bio);
635                         kfree(device);
636                         return -ENOMEM;
637                 }
638                 rcu_assign_pointer(device->name, name);
639
640                 mutex_lock(&fs_devices->device_list_mutex);
641                 list_add_rcu(&device->dev_list, &fs_devices->devices);
642                 fs_devices->num_devices++;
643                 mutex_unlock(&fs_devices->device_list_mutex);
644
645                 ret = 1;
646                 device->fs_devices = fs_devices;
647         } else if (!device->name || strcmp(device->name->str, path)) {
648                 /*
649                  * When FS is already mounted.
650                  * 1. If you are here and if the device->name is NULL that
651                  *    means this device was missing at time of FS mount.
652                  * 2. If you are here and if the device->name is different
653                  *    from 'path' that means either
654                  *      a. The same device disappeared and reappeared with
655                  *         different name. or
656                  *      b. The missing-disk-which-was-replaced, has
657                  *         reappeared now.
658                  *
659                  * We must allow 1 and 2a above. But 2b would be a spurious
660                  * and unintentional.
661                  *
662                  * Further in case of 1 and 2a above, the disk at 'path'
663                  * would have missed some transaction when it was away and
664                  * in case of 2a the stale bdev has to be updated as well.
665                  * 2b must not be allowed at all time.
666                  */
667
668                 /*
669                  * For now, we do allow update to btrfs_fs_device through the
670                  * btrfs dev scan cli after FS has been mounted.  We're still
671                  * tracking a problem where systems fail mount by subvolume id
672                  * when we reject replacement on a mounted FS.
673                  */
674                 if (!fs_devices->opened && found_transid < device->generation) {
675                         /*
676                          * That is if the FS is _not_ mounted and if you
677                          * are here, that means there is more than one
678                          * disk with same uuid and devid.We keep the one
679                          * with larger generation number or the last-in if
680                          * generation are equal.
681                          */
682                         return -EEXIST;
683                 }
684
685                 name = rcu_string_strdup(path, GFP_NOFS);
686                 if (!name)
687                         return -ENOMEM;
688                 rcu_string_free(device->name);
689                 rcu_assign_pointer(device->name, name);
690                 if (device->missing) {
691                         fs_devices->missing_devices--;
692                         device->missing = 0;
693                 }
694         }
695
696         /*
697          * Unmount does not free the btrfs_device struct but would zero
698          * generation along with most of the other members. So just update
699          * it back. We need it to pick the disk with largest generation
700          * (as above).
701          */
702         if (!fs_devices->opened)
703                 device->generation = found_transid;
704
705         /*
706          * if there is new btrfs on an already registered device,
707          * then remove the stale device entry.
708          */
709         if (ret > 0)
710                 btrfs_free_stale_device(device);
711
712         *fs_devices_ret = fs_devices;
713
714         return ret;
715 }
716
717 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
718 {
719         struct btrfs_fs_devices *fs_devices;
720         struct btrfs_device *device;
721         struct btrfs_device *orig_dev;
722
723         fs_devices = alloc_fs_devices(orig->fsid);
724         if (IS_ERR(fs_devices))
725                 return fs_devices;
726
727         mutex_lock(&orig->device_list_mutex);
728         fs_devices->total_devices = orig->total_devices;
729
730         /* We have held the volume lock, it is safe to get the devices. */
731         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
732                 struct rcu_string *name;
733
734                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
735                                             orig_dev->uuid);
736                 if (IS_ERR(device))
737                         goto error;
738
739                 /*
740                  * This is ok to do without rcu read locked because we hold the
741                  * uuid mutex so nothing we touch in here is going to disappear.
742                  */
743                 if (orig_dev->name) {
744                         name = rcu_string_strdup(orig_dev->name->str,
745                                         GFP_KERNEL);
746                         if (!name) {
747                                 bio_put(device->flush_bio);
748                                 kfree(device);
749                                 goto error;
750                         }
751                         rcu_assign_pointer(device->name, name);
752                 }
753
754                 list_add(&device->dev_list, &fs_devices->devices);
755                 device->fs_devices = fs_devices;
756                 fs_devices->num_devices++;
757         }
758         mutex_unlock(&orig->device_list_mutex);
759         return fs_devices;
760 error:
761         mutex_unlock(&orig->device_list_mutex);
762         free_fs_devices(fs_devices);
763         return ERR_PTR(-ENOMEM);
764 }
765
766 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
767 {
768         struct btrfs_device *device, *next;
769         struct btrfs_device *latest_dev = NULL;
770
771         mutex_lock(&uuid_mutex);
772 again:
773         /* This is the initialized path, it is safe to release the devices. */
774         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
775                 if (device->in_fs_metadata) {
776                         if (!device->is_tgtdev_for_dev_replace &&
777                             (!latest_dev ||
778                              device->generation > latest_dev->generation)) {
779                                 latest_dev = device;
780                         }
781                         continue;
782                 }
783
784                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
785                         /*
786                          * In the first step, keep the device which has
787                          * the correct fsid and the devid that is used
788                          * for the dev_replace procedure.
789                          * In the second step, the dev_replace state is
790                          * read from the device tree and it is known
791                          * whether the procedure is really active or
792                          * not, which means whether this device is
793                          * used or whether it should be removed.
794                          */
795                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
796                                 continue;
797                         }
798                 }
799                 if (device->bdev) {
800                         blkdev_put(device->bdev, device->mode);
801                         device->bdev = NULL;
802                         fs_devices->open_devices--;
803                 }
804                 if (device->writeable) {
805                         list_del_init(&device->dev_alloc_list);
806                         device->writeable = 0;
807                         if (!device->is_tgtdev_for_dev_replace)
808                                 fs_devices->rw_devices--;
809                 }
810                 list_del_init(&device->dev_list);
811                 fs_devices->num_devices--;
812                 rcu_string_free(device->name);
813                 bio_put(device->flush_bio);
814                 kfree(device);
815         }
816
817         if (fs_devices->seed) {
818                 fs_devices = fs_devices->seed;
819                 goto again;
820         }
821
822         fs_devices->latest_bdev = latest_dev->bdev;
823
824         mutex_unlock(&uuid_mutex);
825 }
826
827 static void __free_device(struct work_struct *work)
828 {
829         struct btrfs_device *device;
830
831         device = container_of(work, struct btrfs_device, rcu_work);
832         rcu_string_free(device->name);
833         bio_put(device->flush_bio);
834         kfree(device);
835 }
836
837 static void free_device(struct rcu_head *head)
838 {
839         struct btrfs_device *device;
840
841         device = container_of(head, struct btrfs_device, rcu);
842
843         INIT_WORK(&device->rcu_work, __free_device);
844         schedule_work(&device->rcu_work);
845 }
846
847 static void btrfs_close_bdev(struct btrfs_device *device)
848 {
849         if (device->bdev && device->writeable) {
850                 sync_blockdev(device->bdev);
851                 invalidate_bdev(device->bdev);
852         }
853
854         if (device->bdev)
855                 blkdev_put(device->bdev, device->mode);
856 }
857
858 static void btrfs_prepare_close_one_device(struct btrfs_device *device)
859 {
860         struct btrfs_fs_devices *fs_devices = device->fs_devices;
861         struct btrfs_device *new_device;
862         struct rcu_string *name;
863
864         if (device->bdev)
865                 fs_devices->open_devices--;
866
867         if (device->writeable &&
868             device->devid != BTRFS_DEV_REPLACE_DEVID) {
869                 list_del_init(&device->dev_alloc_list);
870                 fs_devices->rw_devices--;
871         }
872
873         if (device->missing)
874                 fs_devices->missing_devices--;
875
876         new_device = btrfs_alloc_device(NULL, &device->devid,
877                                         device->uuid);
878         BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
879
880         /* Safe because we are under uuid_mutex */
881         if (device->name) {
882                 name = rcu_string_strdup(device->name->str, GFP_NOFS);
883                 BUG_ON(!name); /* -ENOMEM */
884                 rcu_assign_pointer(new_device->name, name);
885         }
886
887         list_replace_rcu(&device->dev_list, &new_device->dev_list);
888         new_device->fs_devices = device->fs_devices;
889 }
890
891 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
892 {
893         struct btrfs_device *device, *tmp;
894         struct list_head pending_put;
895
896         INIT_LIST_HEAD(&pending_put);
897
898         if (--fs_devices->opened > 0)
899                 return 0;
900
901         mutex_lock(&fs_devices->device_list_mutex);
902         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
903                 btrfs_prepare_close_one_device(device);
904                 list_add(&device->dev_list, &pending_put);
905         }
906         mutex_unlock(&fs_devices->device_list_mutex);
907
908         /*
909          * btrfs_show_devname() is using the device_list_mutex,
910          * sometimes call to blkdev_put() leads vfs calling
911          * into this func. So do put outside of device_list_mutex,
912          * as of now.
913          */
914         while (!list_empty(&pending_put)) {
915                 device = list_first_entry(&pending_put,
916                                 struct btrfs_device, dev_list);
917                 list_del(&device->dev_list);
918                 btrfs_close_bdev(device);
919                 call_rcu(&device->rcu, free_device);
920         }
921
922         WARN_ON(fs_devices->open_devices);
923         WARN_ON(fs_devices->rw_devices);
924         fs_devices->opened = 0;
925         fs_devices->seeding = 0;
926
927         return 0;
928 }
929
930 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
931 {
932         struct btrfs_fs_devices *seed_devices = NULL;
933         int ret;
934
935         mutex_lock(&uuid_mutex);
936         ret = __btrfs_close_devices(fs_devices);
937         if (!fs_devices->opened) {
938                 seed_devices = fs_devices->seed;
939                 fs_devices->seed = NULL;
940         }
941         mutex_unlock(&uuid_mutex);
942
943         while (seed_devices) {
944                 fs_devices = seed_devices;
945                 seed_devices = fs_devices->seed;
946                 __btrfs_close_devices(fs_devices);
947                 free_fs_devices(fs_devices);
948         }
949         /*
950          * Wait for rcu kworkers under __btrfs_close_devices
951          * to finish all blkdev_puts so device is really
952          * free when umount is done.
953          */
954         rcu_barrier();
955         return ret;
956 }
957
958 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
959                                 fmode_t flags, void *holder)
960 {
961         struct request_queue *q;
962         struct block_device *bdev;
963         struct list_head *head = &fs_devices->devices;
964         struct btrfs_device *device;
965         struct btrfs_device *latest_dev = NULL;
966         struct buffer_head *bh;
967         struct btrfs_super_block *disk_super;
968         u64 devid;
969         int seeding = 1;
970         int ret = 0;
971
972         flags |= FMODE_EXCL;
973
974         list_for_each_entry(device, head, dev_list) {
975                 if (device->bdev)
976                         continue;
977                 if (!device->name)
978                         continue;
979
980                 /* Just open everything we can; ignore failures here */
981                 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
982                                             &bdev, &bh))
983                         continue;
984
985                 disk_super = (struct btrfs_super_block *)bh->b_data;
986                 devid = btrfs_stack_device_id(&disk_super->dev_item);
987                 if (devid != device->devid)
988                         goto error_brelse;
989
990                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
991                            BTRFS_UUID_SIZE))
992                         goto error_brelse;
993
994                 device->generation = btrfs_super_generation(disk_super);
995                 if (!latest_dev ||
996                     device->generation > latest_dev->generation)
997                         latest_dev = device;
998
999                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
1000                         device->writeable = 0;
1001                 } else {
1002                         device->writeable = !bdev_read_only(bdev);
1003                         seeding = 0;
1004                 }
1005
1006                 q = bdev_get_queue(bdev);
1007                 if (blk_queue_discard(q))
1008                         device->can_discard = 1;
1009                 if (!blk_queue_nonrot(q))
1010                         fs_devices->rotating = 1;
1011
1012                 device->bdev = bdev;
1013                 device->in_fs_metadata = 0;
1014                 device->mode = flags;
1015
1016                 fs_devices->open_devices++;
1017                 if (device->writeable &&
1018                     device->devid != BTRFS_DEV_REPLACE_DEVID) {
1019                         fs_devices->rw_devices++;
1020                         list_add(&device->dev_alloc_list,
1021                                  &fs_devices->alloc_list);
1022                 }
1023                 brelse(bh);
1024                 continue;
1025
1026 error_brelse:
1027                 brelse(bh);
1028                 blkdev_put(bdev, flags);
1029                 continue;
1030         }
1031         if (fs_devices->open_devices == 0) {
1032                 ret = -EINVAL;
1033                 goto out;
1034         }
1035         fs_devices->seeding = seeding;
1036         fs_devices->opened = 1;
1037         fs_devices->latest_bdev = latest_dev->bdev;
1038         fs_devices->total_rw_bytes = 0;
1039 out:
1040         return ret;
1041 }
1042
1043 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1044                        fmode_t flags, void *holder)
1045 {
1046         int ret;
1047
1048         mutex_lock(&uuid_mutex);
1049         if (fs_devices->opened) {
1050                 fs_devices->opened++;
1051                 ret = 0;
1052         } else {
1053                 ret = __btrfs_open_devices(fs_devices, flags, holder);
1054         }
1055         mutex_unlock(&uuid_mutex);
1056         return ret;
1057 }
1058
/*
 * Undo btrfs_read_disk_super(): unmap @page and drop the page reference.
 */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1064
1065 static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1066                                  struct page **page,
1067                                  struct btrfs_super_block **disk_super)
1068 {
1069         void *p;
1070         pgoff_t index;
1071
1072         /* make sure our super fits in the device */
1073         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1074                 return 1;
1075
1076         /* make sure our super fits in the page */
1077         if (sizeof(**disk_super) > PAGE_SIZE)
1078                 return 1;
1079
1080         /* make sure our super doesn't straddle pages on disk */
1081         index = bytenr >> PAGE_SHIFT;
1082         if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1083                 return 1;
1084
1085         /* pull in the page with our super */
1086         *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1087                                    index, GFP_KERNEL);
1088
1089         if (IS_ERR_OR_NULL(*page))
1090                 return 1;
1091
1092         p = kmap(*page);
1093
1094         /* align our pointer to the offset of the super block */
1095         *disk_super = p + (bytenr & ~PAGE_MASK);
1096
1097         if (btrfs_super_bytenr(*disk_super) != bytenr ||
1098             btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1099                 btrfs_release_disk_super(*page);
1100                 return 1;
1101         }
1102
1103         if ((*disk_super)->label[0] &&
1104                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1105                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1106
1107         return 0;
1108 }
1109
1110 /*
1111  * Look for a btrfs signature on a device. This may be called out of the mount path
1112  * and we are not allowed to call set_blocksize during the scan. The superblock
1113  * is read via pagecache
1114  */
1115 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1116                           struct btrfs_fs_devices **fs_devices_ret)
1117 {
1118         struct btrfs_super_block *disk_super;
1119         struct block_device *bdev;
1120         struct page *page;
1121         int ret = -EINVAL;
1122         u64 devid;
1123         u64 transid;
1124         u64 total_devices;
1125         u64 bytenr;
1126
1127         /*
1128          * we would like to check all the supers, but that would make
1129          * a btrfs mount succeed after a mkfs from a different FS.
1130          * So, we need to add a special mount option to scan for
1131          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1132          */
1133         bytenr = btrfs_sb_offset(0);
1134         flags |= FMODE_EXCL;
1135         mutex_lock(&uuid_mutex);
1136
1137         bdev = blkdev_get_by_path(path, flags, holder);
1138         if (IS_ERR(bdev)) {
1139                 ret = PTR_ERR(bdev);
1140                 goto error;
1141         }
1142
1143         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
1144                 goto error_bdev_put;
1145
1146         devid = btrfs_stack_device_id(&disk_super->dev_item);
1147         transid = btrfs_super_generation(disk_super);
1148         total_devices = btrfs_super_num_devices(disk_super);
1149
1150         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
1151         if (ret > 0) {
1152                 if (disk_super->label[0]) {
1153                         pr_info("BTRFS: device label %s ", disk_super->label);
1154                 } else {
1155                         pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
1156                 }
1157
1158                 pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
1159                 ret = 0;
1160         }
1161         if (!ret && fs_devices_ret)
1162                 (*fs_devices_ret)->total_devices = total_devices;
1163
1164         btrfs_release_disk_super(page);
1165
1166 error_bdev_put:
1167         blkdev_put(bdev, flags);
1168 error:
1169         mutex_unlock(&uuid_mutex);
1170         return ret;
1171 }
1172
1173 /* helper to account the used device space in the range */
1174 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1175                                    u64 end, u64 *length)
1176 {
1177         struct btrfs_key key;
1178         struct btrfs_root *root = device->fs_info->dev_root;
1179         struct btrfs_dev_extent *dev_extent;
1180         struct btrfs_path *path;
1181         u64 extent_end;
1182         int ret;
1183         int slot;
1184         struct extent_buffer *l;
1185
1186         *length = 0;
1187
1188         if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
1189                 return 0;
1190
1191         path = btrfs_alloc_path();
1192         if (!path)
1193                 return -ENOMEM;
1194         path->reada = READA_FORWARD;
1195
1196         key.objectid = device->devid;
1197         key.offset = start;
1198         key.type = BTRFS_DEV_EXTENT_KEY;
1199
1200         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1201         if (ret < 0)
1202                 goto out;
1203         if (ret > 0) {
1204                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1205                 if (ret < 0)
1206                         goto out;
1207         }
1208
1209         while (1) {
1210                 l = path->nodes[0];
1211                 slot = path->slots[0];
1212                 if (slot >= btrfs_header_nritems(l)) {
1213                         ret = btrfs_next_leaf(root, path);
1214                         if (ret == 0)
1215                                 continue;
1216                         if (ret < 0)
1217                                 goto out;
1218
1219                         break;
1220                 }
1221                 btrfs_item_key_to_cpu(l, &key, slot);
1222
1223                 if (key.objectid < device->devid)
1224                         goto next;
1225
1226                 if (key.objectid > device->devid)
1227                         break;
1228
1229                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1230                         goto next;
1231
1232                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1233                 extent_end = key.offset + btrfs_dev_extent_length(l,
1234                                                                   dev_extent);
1235                 if (key.offset <= start && extent_end > end) {
1236                         *length = end - start + 1;
1237                         break;
1238                 } else if (key.offset <= start && extent_end > start)
1239                         *length += extent_end - start;
1240                 else if (key.offset > start && extent_end <= end)
1241                         *length += extent_end - key.offset;
1242                 else if (key.offset > start && key.offset <= end) {
1243                         *length += end - key.offset + 1;
1244                         break;
1245                 } else if (key.offset > end)
1246                         break;
1247
1248 next:
1249                 path->slots[0]++;
1250         }
1251         ret = 0;
1252 out:
1253         btrfs_free_path(path);
1254         return ret;
1255 }
1256
1257 static int contains_pending_extent(struct btrfs_transaction *transaction,
1258                                    struct btrfs_device *device,
1259                                    u64 *start, u64 len)
1260 {
1261         struct btrfs_fs_info *fs_info = device->fs_info;
1262         struct extent_map *em;
1263         struct list_head *search_list = &fs_info->pinned_chunks;
1264         int ret = 0;
1265         u64 physical_start = *start;
1266
1267         if (transaction)
1268                 search_list = &transaction->pending_chunks;
1269 again:
1270         list_for_each_entry(em, search_list, list) {
1271                 struct map_lookup *map;
1272                 int i;
1273
1274                 map = em->map_lookup;
1275                 for (i = 0; i < map->num_stripes; i++) {
1276                         u64 end;
1277
1278                         if (map->stripes[i].dev != device)
1279                                 continue;
1280                         if (map->stripes[i].physical >= physical_start + len ||
1281                             map->stripes[i].physical + em->orig_block_len <=
1282                             physical_start)
1283                                 continue;
1284                         /*
1285                          * Make sure that while processing the pinned list we do
1286                          * not override our *start with a lower value, because
1287                          * we can have pinned chunks that fall within this
1288                          * device hole and that have lower physical addresses
1289                          * than the pending chunks we processed before. If we
1290                          * do not take this special care we can end up getting
1291                          * 2 pending chunks that start at the same physical
1292                          * device offsets because the end offset of a pinned
1293                          * chunk can be equal to the start offset of some
1294                          * pending chunk.
1295                          */
1296                         end = map->stripes[i].physical + em->orig_block_len;
1297                         if (end > *start) {
1298                                 *start = end;
1299                                 ret = 1;
1300                         }
1301                 }
1302         }
1303         if (search_list != &fs_info->pinned_chunks) {
1304                 search_list = &fs_info->pinned_chunks;
1305                 goto again;
1306         }
1307
1308         return ret;
1309 }
1310
1311
1312 /*
1313  * find_free_dev_extent_start - find free space in the specified device
1314  * @device:       the device which we search the free space in
1315  * @num_bytes:    the size of the free space that we need
1316  * @search_start: the position from which to begin the search
1317  * @start:        store the start of the free space.
1318  * @len:          the size of the free space. that we find, or the size
1319  *                of the max free space if we don't find suitable free space
1320  *
1321  * this uses a pretty simple search, the expectation is that it is
1322  * called very infrequently and that a given device has a small number
1323  * of extents
1324  *
1325  * @start is used to store the start of the free space if we find. But if we
1326  * don't find suitable free space, it will be used to store the start position
1327  * of the max free space.
1328  *
1329  * @len is used to store the size of the free space that we find.
1330  * But if we don't find suitable free space, it is used to store the size of
1331  * the max free space.
1332  */
1333 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1334                                struct btrfs_device *device, u64 num_bytes,
1335                                u64 search_start, u64 *start, u64 *len)
1336 {
1337         struct btrfs_fs_info *fs_info = device->fs_info;
1338         struct btrfs_root *root = fs_info->dev_root;
1339         struct btrfs_key key;
1340         struct btrfs_dev_extent *dev_extent;
1341         struct btrfs_path *path;
1342         u64 hole_size;
1343         u64 max_hole_start;
1344         u64 max_hole_size;
1345         u64 extent_end;
1346         u64 search_end = device->total_bytes;
1347         int ret;
1348         int slot;
1349         struct extent_buffer *l;
1350
1351         /*
1352          * We don't want to overwrite the superblock on the drive nor any area
1353          * used by the boot loader (grub for example), so we make sure to start
1354          * at an offset of at least 1MB.
1355          */
1356         search_start = max_t(u64, search_start, SZ_1M);
1357
1358         path = btrfs_alloc_path();
1359         if (!path)
1360                 return -ENOMEM;
1361
1362         max_hole_start = search_start;
1363         max_hole_size = 0;
1364
1365 again:
1366         if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1367                 ret = -ENOSPC;
1368                 goto out;
1369         }
1370
1371         path->reada = READA_FORWARD;
1372         path->search_commit_root = 1;
1373         path->skip_locking = 1;
1374
1375         key.objectid = device->devid;
1376         key.offset = search_start;
1377         key.type = BTRFS_DEV_EXTENT_KEY;
1378
1379         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1380         if (ret < 0)
1381                 goto out;
1382         if (ret > 0) {
1383                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1384                 if (ret < 0)
1385                         goto out;
1386         }
1387
1388         while (1) {
1389                 l = path->nodes[0];
1390                 slot = path->slots[0];
1391                 if (slot >= btrfs_header_nritems(l)) {
1392                         ret = btrfs_next_leaf(root, path);
1393                         if (ret == 0)
1394                                 continue;
1395                         if (ret < 0)
1396                                 goto out;
1397
1398                         break;
1399                 }
1400                 btrfs_item_key_to_cpu(l, &key, slot);
1401
1402                 if (key.objectid < device->devid)
1403                         goto next;
1404
1405                 if (key.objectid > device->devid)
1406                         break;
1407
1408                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1409                         goto next;
1410
1411                 if (key.offset > search_start) {
1412                         hole_size = key.offset - search_start;
1413
1414                         /*
1415                          * Have to check before we set max_hole_start, otherwise
1416                          * we could end up sending back this offset anyway.
1417                          */
1418                         if (contains_pending_extent(transaction, device,
1419                                                     &search_start,
1420                                                     hole_size)) {
1421                                 if (key.offset >= search_start) {
1422                                         hole_size = key.offset - search_start;
1423                                 } else {
1424                                         WARN_ON_ONCE(1);
1425                                         hole_size = 0;
1426                                 }
1427                         }
1428
1429                         if (hole_size > max_hole_size) {
1430                                 max_hole_start = search_start;
1431                                 max_hole_size = hole_size;
1432                         }
1433
1434                         /*
1435                          * If this free space is greater than which we need,
1436                          * it must be the max free space that we have found
1437                          * until now, so max_hole_start must point to the start
1438                          * of this free space and the length of this free space
1439                          * is stored in max_hole_size. Thus, we return
1440                          * max_hole_start and max_hole_size and go back to the
1441                          * caller.
1442                          */
1443                         if (hole_size >= num_bytes) {
1444                                 ret = 0;
1445                                 goto out;
1446                         }
1447                 }
1448
1449                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1450                 extent_end = key.offset + btrfs_dev_extent_length(l,
1451                                                                   dev_extent);
1452                 if (extent_end > search_start)
1453                         search_start = extent_end;
1454 next:
1455                 path->slots[0]++;
1456                 cond_resched();
1457         }
1458
1459         /*
1460          * At this point, search_start should be the end of
1461          * allocated dev extents, and when shrinking the device,
1462          * search_end may be smaller than search_start.
1463          */
1464         if (search_end > search_start) {
1465                 hole_size = search_end - search_start;
1466
1467                 if (contains_pending_extent(transaction, device, &search_start,
1468                                             hole_size)) {
1469                         btrfs_release_path(path);
1470                         goto again;
1471                 }
1472
1473                 if (hole_size > max_hole_size) {
1474                         max_hole_start = search_start;
1475                         max_hole_size = hole_size;
1476                 }
1477         }
1478
1479         /* See above. */
1480         if (max_hole_size < num_bytes)
1481                 ret = -ENOSPC;
1482         else
1483                 ret = 0;
1484
1485 out:
1486         btrfs_free_path(path);
1487         *start = max_hole_start;
1488         if (len)
1489                 *len = max_hole_size;
1490         return ret;
1491 }
1492
1493 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1494                          struct btrfs_device *device, u64 num_bytes,
1495                          u64 *start, u64 *len)
1496 {
1497         /* FIXME use last free of some kind */
1498         return find_free_dev_extent_start(trans->transaction, device,
1499                                           num_bytes, 0, start, len);
1500 }
1501
1502 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1503                           struct btrfs_device *device,
1504                           u64 start, u64 *dev_extent_len)
1505 {
1506         struct btrfs_fs_info *fs_info = device->fs_info;
1507         struct btrfs_root *root = fs_info->dev_root;
1508         int ret;
1509         struct btrfs_path *path;
1510         struct btrfs_key key;
1511         struct btrfs_key found_key;
1512         struct extent_buffer *leaf = NULL;
1513         struct btrfs_dev_extent *extent = NULL;
1514
1515         path = btrfs_alloc_path();
1516         if (!path)
1517                 return -ENOMEM;
1518
1519         key.objectid = device->devid;
1520         key.offset = start;
1521         key.type = BTRFS_DEV_EXTENT_KEY;
1522 again:
1523         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1524         if (ret > 0) {
1525                 ret = btrfs_previous_item(root, path, key.objectid,
1526                                           BTRFS_DEV_EXTENT_KEY);
1527                 if (ret)
1528                         goto out;
1529                 leaf = path->nodes[0];
1530                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1531                 extent = btrfs_item_ptr(leaf, path->slots[0],
1532                                         struct btrfs_dev_extent);
1533                 BUG_ON(found_key.offset > start || found_key.offset +
1534                        btrfs_dev_extent_length(leaf, extent) < start);
1535                 key = found_key;
1536                 btrfs_release_path(path);
1537                 goto again;
1538         } else if (ret == 0) {
1539                 leaf = path->nodes[0];
1540                 extent = btrfs_item_ptr(leaf, path->slots[0],
1541                                         struct btrfs_dev_extent);
1542         } else {
1543                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1544                 goto out;
1545         }
1546
1547         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1548
1549         ret = btrfs_del_item(trans, root, path);
1550         if (ret) {
1551                 btrfs_handle_fs_error(fs_info, ret,
1552                                       "Failed to remove dev extent item");
1553         } else {
1554                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1555         }
1556 out:
1557         btrfs_free_path(path);
1558         return ret;
1559 }
1560
1561 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1562                                   struct btrfs_device *device,
1563                                   u64 chunk_offset, u64 start, u64 num_bytes)
1564 {
1565         int ret;
1566         struct btrfs_path *path;
1567         struct btrfs_fs_info *fs_info = device->fs_info;
1568         struct btrfs_root *root = fs_info->dev_root;
1569         struct btrfs_dev_extent *extent;
1570         struct extent_buffer *leaf;
1571         struct btrfs_key key;
1572
1573         WARN_ON(!device->in_fs_metadata);
1574         WARN_ON(device->is_tgtdev_for_dev_replace);
1575         path = btrfs_alloc_path();
1576         if (!path)
1577                 return -ENOMEM;
1578
1579         key.objectid = device->devid;
1580         key.offset = start;
1581         key.type = BTRFS_DEV_EXTENT_KEY;
1582         ret = btrfs_insert_empty_item(trans, root, path, &key,
1583                                       sizeof(*extent));
1584         if (ret)
1585                 goto out;
1586
1587         leaf = path->nodes[0];
1588         extent = btrfs_item_ptr(leaf, path->slots[0],
1589                                 struct btrfs_dev_extent);
1590         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1591                                         BTRFS_CHUNK_TREE_OBJECTID);
1592         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1593                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1594         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1595
1596         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1597         btrfs_mark_buffer_dirty(leaf);
1598 out:
1599         btrfs_free_path(path);
1600         return ret;
1601 }
1602
1603 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1604 {
1605         struct extent_map_tree *em_tree;
1606         struct extent_map *em;
1607         struct rb_node *n;
1608         u64 ret = 0;
1609
1610         em_tree = &fs_info->mapping_tree.map_tree;
1611         read_lock(&em_tree->lock);
1612         n = rb_last(&em_tree->map);
1613         if (n) {
1614                 em = rb_entry(n, struct extent_map, rb_node);
1615                 ret = em->start + em->len;
1616         }
1617         read_unlock(&em_tree->lock);
1618
1619         return ret;
1620 }
1621
1622 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1623                                     u64 *devid_ret)
1624 {
1625         int ret;
1626         struct btrfs_key key;
1627         struct btrfs_key found_key;
1628         struct btrfs_path *path;
1629
1630         path = btrfs_alloc_path();
1631         if (!path)
1632                 return -ENOMEM;
1633
1634         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1635         key.type = BTRFS_DEV_ITEM_KEY;
1636         key.offset = (u64)-1;
1637
1638         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1639         if (ret < 0)
1640                 goto error;
1641
1642         BUG_ON(ret == 0); /* Corruption */
1643
1644         ret = btrfs_previous_item(fs_info->chunk_root, path,
1645                                   BTRFS_DEV_ITEMS_OBJECTID,
1646                                   BTRFS_DEV_ITEM_KEY);
1647         if (ret) {
1648                 *devid_ret = 1;
1649         } else {
1650                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1651                                       path->slots[0]);
1652                 *devid_ret = found_key.offset + 1;
1653         }
1654         ret = 0;
1655 error:
1656         btrfs_free_path(path);
1657         return ret;
1658 }
1659
1660 /*
1661  * the device information is stored in the chunk root
1662  * the btrfs_device struct should be fully filled in
1663  */
1664 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1665                             struct btrfs_fs_info *fs_info,
1666                             struct btrfs_device *device)
1667 {
1668         struct btrfs_root *root = fs_info->chunk_root;
1669         int ret;
1670         struct btrfs_path *path;
1671         struct btrfs_dev_item *dev_item;
1672         struct extent_buffer *leaf;
1673         struct btrfs_key key;
1674         unsigned long ptr;
1675
1676         path = btrfs_alloc_path();
1677         if (!path)
1678                 return -ENOMEM;
1679
1680         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1681         key.type = BTRFS_DEV_ITEM_KEY;
1682         key.offset = device->devid;
1683
1684         ret = btrfs_insert_empty_item(trans, root, path, &key,
1685                                       sizeof(*dev_item));
1686         if (ret)
1687                 goto out;
1688
1689         leaf = path->nodes[0];
1690         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1691
1692         btrfs_set_device_id(leaf, dev_item, device->devid);
1693         btrfs_set_device_generation(leaf, dev_item, 0);
1694         btrfs_set_device_type(leaf, dev_item, device->type);
1695         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1696         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1697         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1698         btrfs_set_device_total_bytes(leaf, dev_item,
1699                                      btrfs_device_get_disk_total_bytes(device));
1700         btrfs_set_device_bytes_used(leaf, dev_item,
1701                                     btrfs_device_get_bytes_used(device));
1702         btrfs_set_device_group(leaf, dev_item, 0);
1703         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1704         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1705         btrfs_set_device_start_offset(leaf, dev_item, 0);
1706
1707         ptr = btrfs_device_uuid(dev_item);
1708         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1709         ptr = btrfs_device_fsid(dev_item);
1710         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1711         btrfs_mark_buffer_dirty(leaf);
1712
1713         ret = 0;
1714 out:
1715         btrfs_free_path(path);
1716         return ret;
1717 }
1718
1719 /*
1720  * Function to update ctime/mtime for a given device path.
1721  * Mainly used for ctime/mtime based probe like libblkid.
1722  */
1723 static void update_dev_time(const char *path_name)
1724 {
1725         struct file *filp;
1726
1727         filp = filp_open(path_name, O_RDWR, 0);
1728         if (IS_ERR(filp))
1729                 return;
1730         file_update_time(filp);
1731         filp_close(filp, NULL);
1732 }
1733
1734 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1735                              struct btrfs_device *device)
1736 {
1737         struct btrfs_root *root = fs_info->chunk_root;
1738         int ret;
1739         struct btrfs_path *path;
1740         struct btrfs_key key;
1741         struct btrfs_trans_handle *trans;
1742
1743         path = btrfs_alloc_path();
1744         if (!path)
1745                 return -ENOMEM;
1746
1747         trans = btrfs_start_transaction(root, 0);
1748         if (IS_ERR(trans)) {
1749                 btrfs_free_path(path);
1750                 return PTR_ERR(trans);
1751         }
1752         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1753         key.type = BTRFS_DEV_ITEM_KEY;
1754         key.offset = device->devid;
1755
1756         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1757         if (ret) {
1758                 if (ret > 0)
1759                         ret = -ENOENT;
1760                 btrfs_abort_transaction(trans, ret);
1761                 btrfs_end_transaction(trans);
1762                 goto out;
1763         }
1764
1765         ret = btrfs_del_item(trans, root, path);
1766         if (ret) {
1767                 btrfs_abort_transaction(trans, ret);
1768                 btrfs_end_transaction(trans);
1769         }
1770
1771 out:
1772         btrfs_free_path(path);
1773         if (!ret)
1774                 ret = btrfs_commit_transaction(trans);
1775         return ret;
1776 }
1777
1778 /*
1779  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1780  * filesystem. It's up to the caller to adjust that number regarding eg. device
1781  * replace.
1782  */
1783 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1784                 u64 num_devices)
1785 {
1786         u64 all_avail;
1787         unsigned seq;
1788         int i;
1789
1790         do {
1791                 seq = read_seqbegin(&fs_info->profiles_lock);
1792
1793                 all_avail = fs_info->avail_data_alloc_bits |
1794                             fs_info->avail_system_alloc_bits |
1795                             fs_info->avail_metadata_alloc_bits;
1796         } while (read_seqretry(&fs_info->profiles_lock, seq));
1797
1798         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1799                 if (!(all_avail & btrfs_raid_group[i]))
1800                         continue;
1801
1802                 if (num_devices < btrfs_raid_array[i].devs_min) {
1803                         int ret = btrfs_raid_mindev_error[i];
1804
1805                         if (ret)
1806                                 return ret;
1807                 }
1808         }
1809
1810         return 0;
1811 }
1812
1813 static struct btrfs_device * btrfs_find_next_active_device(
1814                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1815 {
1816         struct btrfs_device *next_device;
1817
1818         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1819                 if (next_device != device &&
1820                         !next_device->missing && next_device->bdev)
1821                         return next_device;
1822         }
1823
1824         return NULL;
1825 }
1826
1827 /*
1828  * Helper function to check if the given device is part of s_bdev / latest_bdev
1829  * and replace it with the provided or the next active device, in the context
1830  * where this function called, there should be always be another device (or
1831  * this_dev) which is active.
1832  */
1833 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1834                 struct btrfs_device *device, struct btrfs_device *this_dev)
1835 {
1836         struct btrfs_device *next_device;
1837
1838         if (this_dev)
1839                 next_device = this_dev;
1840         else
1841                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1842                                                                 device);
1843         ASSERT(next_device);
1844
1845         if (fs_info->sb->s_bdev &&
1846                         (fs_info->sb->s_bdev == device->bdev))
1847                 fs_info->sb->s_bdev = next_device->bdev;
1848
1849         if (fs_info->fs_devices->latest_bdev == device->bdev)
1850                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1851 }
1852
/*
 * Remove one device from the filesystem.
 *
 * The device is looked up with btrfs_find_device_by_devspec() from @devid
 * and @device_path.  Removal is refused when the remaining device count
 * would violate a RAID profile minimum, when the device is a dev-replace
 * target (BTRFS_ERROR_DEV_TGT_REPLACE) or when it is the only writeable
 * device (BTRFS_ERROR_DEV_ONLY_WRITABLE).
 *
 * Otherwise the device is shrunk to zero, its dev item is deleted, it is
 * unlinked from the device lists and counters, its superblocks are scratched
 * (writeable devices only) and it is freed via RCU.
 *
 * Returns 0 on success or the error of the failing step.  uuid_mutex is
 * taken on entry and released on every exit path; it is dropped temporarily
 * around btrfs_shrink_device().
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
	/*
	 * One device less is counted while a replace is ongoing.
	 * NOTE(review): presumably to leave out the replace target; confirm.
	 */
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);

	/* Check the profile constraints against the count after removal. */
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (device->is_tgtdev_for_dev_replace) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	/*
	 * Take a writeable device off the allocation list before shrinking
	 * it; error_undo below puts it back if a later step fails.
	 */
	if (device->writeable) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/* uuid_mutex is not held across the shrink. */
	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/* Remember the owning fs_devices; it is needed after device is freed. */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (device->missing)
		device->fs_devices->missing_devices--;

	/* Move s_bdev / latest_bdev off this device if they refer to it. */
	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (device->writeable)
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device);

	/*
	 * If the owning fs_devices has no open devices left, unlink it from
	 * the seed chain starting at fs_info->fs_devices, then close and
	 * free it.
	 */
	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	/* Put a writeable device back on the allocation list. */
	if (device->writeable) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
1987
1988 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1989                                         struct btrfs_device *srcdev)
1990 {
1991         struct btrfs_fs_devices *fs_devices;
1992
1993         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1994
1995         /*
1996          * in case of fs with no seed, srcdev->fs_devices will point
1997          * to fs_devices of fs_info. However when the dev being replaced is
1998          * a seed dev it will point to the seed's local fs_devices. In short
1999          * srcdev will have its correct fs_devices in both the cases.
2000          */
2001         fs_devices = srcdev->fs_devices;
2002
2003         list_del_rcu(&srcdev->dev_list);
2004         list_del(&srcdev->dev_alloc_list);
2005         fs_devices->num_devices--;
2006         if (srcdev->missing)
2007                 fs_devices->missing_devices--;
2008
2009         if (srcdev->writeable)
2010                 fs_devices->rw_devices--;
2011
2012         if (srcdev->bdev)
2013                 fs_devices->open_devices--;
2014 }
2015
2016 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2017                                       struct btrfs_device *srcdev)
2018 {
2019         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2020
2021         if (srcdev->writeable) {
2022                 /* zero out the old super if it is writable */
2023                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2024         }
2025
2026         btrfs_close_bdev(srcdev);
2027         call_rcu(&srcdev->rcu, free_device);
2028
2029         /* if this is no devs we rather delete the fs_devices */
2030         if (!fs_devices->num_devices) {
2031                 struct btrfs_fs_devices *tmp_fs_devices;
2032
2033                 /*
2034                  * On a mounted FS, num_devices can't be zero unless it's a
2035                  * seed. In case of a seed device being replaced, the replace
2036                  * target added to the sprout FS, so there will be no more
2037                  * device left under the seed FS.
2038                  */
2039                 ASSERT(fs_devices->seeding);
2040
2041                 tmp_fs_devices = fs_info->fs_devices;
2042                 while (tmp_fs_devices) {
2043                         if (tmp_fs_devices->seed == fs_devices) {
2044                                 tmp_fs_devices->seed = fs_devices->seed;
2045                                 break;
2046                         }
2047                         tmp_fs_devices = tmp_fs_devices->seed;
2048                 }
2049                 fs_devices->seed = NULL;
2050                 __btrfs_close_devices(fs_devices);
2051                 free_fs_devices(fs_devices);
2052         }
2053 }
2054
2055 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2056                                       struct btrfs_device *tgtdev)
2057 {
2058         mutex_lock(&uuid_mutex);
2059         WARN_ON(!tgtdev);
2060         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2061
2062         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2063
2064         if (tgtdev->bdev)
2065                 fs_info->fs_devices->open_devices--;
2066
2067         fs_info->fs_devices->num_devices--;
2068
2069         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2070
2071         list_del_rcu(&tgtdev->dev_list);
2072
2073         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2074         mutex_unlock(&uuid_mutex);
2075
2076         /*
2077          * The update_dev_time() with in btrfs_scratch_superblocks()
2078          * may lead to a call to btrfs_show_devname() which will try
2079          * to hold device_list_mutex. And here this device
2080          * is already out of device list, so we don't have to hold
2081          * the device_list_mutex lock.
2082          */
2083         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2084
2085         btrfs_close_bdev(tgtdev);
2086         call_rcu(&tgtdev->rcu, free_device);
2087 }
2088
2089 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2090                                      const char *device_path,
2091                                      struct btrfs_device **device)
2092 {
2093         int ret = 0;
2094         struct btrfs_super_block *disk_super;
2095         u64 devid;
2096         u8 *dev_uuid;
2097         struct block_device *bdev;
2098         struct buffer_head *bh;
2099
2100         *device = NULL;
2101         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2102                                     fs_info->bdev_holder, 0, &bdev, &bh);
2103         if (ret)
2104                 return ret;
2105         disk_super = (struct btrfs_super_block *)bh->b_data;
2106         devid = btrfs_stack_device_id(&disk_super->dev_item);
2107         dev_uuid = disk_super->dev_item.uuid;
2108         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2109         brelse(bh);
2110         if (!*device)
2111                 ret = -ENOENT;
2112         blkdev_put(bdev, FMODE_READ);
2113         return ret;
2114 }
2115
2116 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2117                                          const char *device_path,
2118                                          struct btrfs_device **device)
2119 {
2120         *device = NULL;
2121         if (strcmp(device_path, "missing") == 0) {
2122                 struct list_head *devices;
2123                 struct btrfs_device *tmp;
2124
2125                 devices = &fs_info->fs_devices->devices;
2126                 /*
2127                  * It is safe to read the devices since the volume_mutex
2128                  * is held by the caller.
2129                  */
2130                 list_for_each_entry(tmp, devices, dev_list) {
2131                         if (tmp->in_fs_metadata && !tmp->bdev) {
2132                                 *device = tmp;
2133                                 break;
2134                         }
2135                 }
2136
2137                 if (!*device)
2138                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2139
2140                 return 0;
2141         } else {
2142                 return btrfs_find_device_by_path(fs_info, device_path, device);
2143         }
2144 }
2145
2146 /*
2147  * Lookup a device given by device id, or the path if the id is 0.
2148  */
2149 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2150                                  const char *devpath,
2151                                  struct btrfs_device **device)
2152 {
2153         int ret;
2154
2155         if (devid) {
2156                 ret = 0;
2157                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2158                 if (!*device)
2159                         ret = -ENOENT;
2160         } else {
2161                 if (!devpath || !devpath[0])
2162                         return -EINVAL;
2163
2164                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2165                                                            device);
2166         }
2167         return ret;
2168 }
2169
/*
 * does all the dirty work required for changing file system's UUID.
 *
 * The devices of the current (seeding) fs_devices are moved to a newly
 * allocated seed_devices, which is hooked up as fs_devices->seed.  The
 * counters of fs_devices are reset, a fresh random fsid is generated and
 * copied into fs_info and the super copy, and the SEEDING flag is cleared
 * from the super copy.  A clone of the old fs_devices is added to fs_uuids.
 *
 * Must be called with uuid_mutex held (enforced with BUG_ON).
 *
 * Returns 0 on success, -EINVAL if fs_devices is not seeding, or the
 * PTR_ERR() of a failed alloc_fs_devices() / clone_fs_devices().
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	/*
	 * Start seed_devices as a byte copy of fs_devices, then re-initialize
	 * the members that must not be shared with the original: both list
	 * heads and the device list mutex.
	 */
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	/* Move every device over to seed_devices and re-parent it. */
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	/* fs_devices is now empty and has seed_devices as its seed. */
	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	/* New fsid, mirrored into fs_info and the super copy. */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}
2232
/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks all BTRFS_DEV_ITEM_KEY items of BTRFS_DEV_ITEMS_OBJECTID in the
 * chunk root.  For each item the matching btrfs_device is looked up, and if
 * that device belongs to a seeding fs_devices the item's generation is set
 * to device->generation and the leaf is marked dirty.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or a negative
 * error from btrfs_search_slot() / btrfs_next_leaf().
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Start at the lowest possible dev item key. */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/*
			 * Ran off the end of this leaf: take the first key of
			 * the next leaf, release the path and search again
			 * from that key.
			 */
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		/* Stop at the first key that is not a dev item. */
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		/* Only items of devices in a seeding fs_devices are updated. */
		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
2306
2307 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2308 {
2309         struct btrfs_root *root = fs_info->dev_root;
2310         struct request_queue *q;
2311         struct btrfs_trans_handle *trans;
2312         struct btrfs_device *device;
2313         struct block_device *bdev;
2314         struct list_head *devices;
2315         struct super_block *sb = fs_info->sb;
2316         struct rcu_string *name;
2317         u64 tmp;
2318         int seeding_dev = 0;
2319         int ret = 0;
2320         bool unlocked = false;
2321
2322         if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2323                 return -EROFS;
2324
2325         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2326                                   fs_info->bdev_holder);
2327         if (IS_ERR(bdev))
2328                 return PTR_ERR(bdev);
2329
2330         if (fs_info->fs_devices->seeding) {
2331                 seeding_dev = 1;
2332                 down_write(&sb->s_umount);
2333                 mutex_lock(&uuid_mutex);
2334         }
2335
2336         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2337
2338         devices = &fs_info->fs_devices->devices;
2339
2340         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2341         list_for_each_entry(device, devices, dev_list) {
2342                 if (device->bdev == bdev) {
2343                         ret = -EEXIST;
2344                         mutex_unlock(
2345                                 &fs_info->fs_devices->device_list_mutex);
2346                         goto error;
2347                 }
2348         }
2349         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2350
2351         device = btrfs_alloc_device(fs_info, NULL, NULL);
2352         if (IS_ERR(device)) {
2353                 /* we can safely leave the fs_devices entry around */
2354                 ret = PTR_ERR(device);
2355                 goto error;
2356         }
2357
2358         name = rcu_string_strdup(device_path, GFP_KERNEL);
2359         if (!name) {
2360                 bio_put(device->flush_bio);
2361                 kfree(device);
2362                 ret = -ENOMEM;
2363                 goto error;
2364         }
2365         rcu_assign_pointer(device->name, name);
2366
2367         trans = btrfs_start_transaction(root, 0);
2368         if (IS_ERR(trans)) {
2369                 rcu_string_free(device->name);
2370                 bio_put(device->flush_bio);
2371                 kfree(device);
2372                 ret = PTR_ERR(trans);
2373                 goto error;
2374         }
2375
2376         q = bdev_get_queue(bdev);
2377         if (blk_queue_discard(q))
2378                 device->can_discard = 1;
2379         device->writeable = 1;
2380         device->generation = trans->transid;
2381         device->io_width = fs_info->sectorsize;
2382         device->io_align = fs_info->sectorsize;
2383         device->sector_size = fs_info->sectorsize;
2384         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2385                                          fs_info->sectorsize);
2386         device->disk_total_bytes = device->total_bytes;
2387         device->commit_total_bytes = device->total_bytes;
2388         device->fs_info = fs_info;
2389         device->bdev = bdev;
2390         device->in_fs_metadata = 1;
2391         device->is_tgtdev_for_dev_replace = 0;
2392         device->mode = FMODE_EXCL;
2393         device->dev_stats_valid = 1;
2394         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2395
2396         if (seeding_dev) {
2397                 sb->s_flags &= ~SB_RDONLY;
2398                 ret = btrfs_prepare_sprout(fs_info);
2399                 if (ret) {
2400                         btrfs_abort_transaction(trans, ret);
2401                         goto error_trans;
2402                 }
2403         }
2404
2405         device->fs_devices = fs_info->fs_devices;
2406
2407         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2408         mutex_lock(&fs_info->chunk_mutex);
2409         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2410         list_add(&device->dev_alloc_list,
2411                  &fs_info->fs_devices->alloc_list);
2412         fs_info->fs_devices->num_devices++;
2413         fs_info->fs_devices->open_devices++;
2414         fs_info->fs_devices->rw_devices++;
2415         fs_info->fs_devices->total_devices++;
2416         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2417
2418         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2419
2420         if (!blk_queue_nonrot(q))
2421                 fs_info->fs_devices->rotating = 1;
2422
2423         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2424         btrfs_set_super_total_bytes(fs_info->super_copy,
2425                 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2426
2427         tmp = btrfs_super_num_devices(fs_info->super_copy);
2428         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2429
2430         /* add sysfs device entry */
2431         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2432
2433         /*
2434          * we've got more storage, clear any full flags on the space
2435          * infos
2436          */
2437         btrfs_clear_space_info_full(fs_info);
2438
2439         mutex_unlock(&fs_info->chunk_mutex);
2440         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2441
2442         if (seeding_dev) {
2443                 mutex_lock(&fs_info->chunk_mutex);
2444                 ret = init_first_rw_device(trans, fs_info);
2445                 mutex_unlock(&fs_info->chunk_mutex);
2446                 if (ret) {
2447                         btrfs_abort_transaction(trans, ret);
2448                         goto error_sysfs;
2449                 }
2450         }
2451
2452         ret = btrfs_add_device(trans, fs_info, device);
2453         if (ret) {
2454                 btrfs_abort_transaction(trans, ret);
2455                 goto error_sysfs;
2456         }
2457
2458         if (seeding_dev) {
2459                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2460
2461                 ret = btrfs_finish_sprout(trans, fs_info);
2462                 if (ret) {
2463                         btrfs_abort_transaction(trans, ret);
2464                         goto error_sysfs;
2465                 }
2466
2467                 /* Sprouting would change fsid of the mounted root,
2468                  * so rename the fsid on the sysfs
2469                  */
2470                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2471                                                 fs_info->fsid);
2472                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2473                         btrfs_warn(fs_info,
2474                                    "sysfs: failed to create fsid for sprout");
2475         }
2476
2477         ret = btrfs_commit_transaction(trans);
2478
2479         if (seeding_dev) {
2480                 mutex_unlock(&uuid_mutex);
2481                 up_write(&sb->s_umount);
2482                 unlocked = true;
2483
2484                 if (ret) /* transaction commit */
2485                         return ret;
2486
2487                 ret = btrfs_relocate_sys_chunks(fs_info);
2488                 if (ret < 0)
2489                         btrfs_handle_fs_error(fs_info, ret,
2490                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2491                 trans = btrfs_attach_transaction(root);
2492                 if (IS_ERR(trans)) {
2493                         if (PTR_ERR(trans) == -ENOENT)
2494                                 return 0;
2495                         ret = PTR_ERR(trans);
2496                         trans = NULL;
2497                         goto error_sysfs;
2498                 }
2499                 ret = btrfs_commit_transaction(trans);
2500         }
2501
2502         /* Update ctime/mtime for libblkid */
2503         update_dev_time(device_path);
2504         return ret;
2505
2506 error_sysfs:
2507         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2508 error_trans:
2509         if (seeding_dev)
2510                 sb->s_flags |= SB_RDONLY;
2511         if (trans)
2512                 btrfs_end_transaction(trans);
2513         rcu_string_free(device->name);
2514         bio_put(device->flush_bio);
2515         kfree(device);
2516 error:
2517         blkdev_put(bdev, FMODE_EXCL);
2518         if (seeding_dev && !unlocked) {
2519                 mutex_unlock(&uuid_mutex);
2520                 up_write(&sb->s_umount);
2521         }
2522         return ret;
2523 }
2524
2525 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2526                                   const char *device_path,
2527                                   struct btrfs_device *srcdev,
2528                                   struct btrfs_device **device_out)
2529 {
2530         struct request_queue *q;
2531         struct btrfs_device *device;
2532         struct block_device *bdev;
2533         struct list_head *devices;
2534         struct rcu_string *name;
2535         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2536         int ret = 0;
2537
2538         *device_out = NULL;
2539         if (fs_info->fs_devices->seeding) {
2540                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2541                 return -EINVAL;
2542         }
2543
2544         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2545                                   fs_info->bdev_holder);
2546         if (IS_ERR(bdev)) {
2547                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2548                 return PTR_ERR(bdev);
2549         }
2550
2551         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2552
2553         devices = &fs_info->fs_devices->devices;
2554         list_for_each_entry(device, devices, dev_list) {
2555                 if (device->bdev == bdev) {
2556                         btrfs_err(fs_info,
2557                                   "target device is in the filesystem!");
2558                         ret = -EEXIST;
2559                         goto error;
2560                 }
2561         }
2562
2563
2564         if (i_size_read(bdev->bd_inode) <
2565             btrfs_device_get_total_bytes(srcdev)) {
2566                 btrfs_err(fs_info,
2567                           "target device is smaller than source device!");
2568                 ret = -EINVAL;
2569                 goto error;
2570         }
2571
2572
2573         device = btrfs_alloc_device(NULL, &devid, NULL);
2574         if (IS_ERR(device)) {
2575                 ret = PTR_ERR(device);
2576                 goto error;
2577         }
2578
2579         name = rcu_string_strdup(device_path, GFP_KERNEL);
2580         if (!name) {
2581                 bio_put(device->flush_bio);
2582                 kfree(device);
2583                 ret = -ENOMEM;
2584                 goto error;
2585         }
2586         rcu_assign_pointer(device->name, name);
2587
2588         q = bdev_get_queue(bdev);
2589         if (blk_queue_discard(q))
2590                 device->can_discard = 1;
2591         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2592         device->writeable = 1;
2593         device->generation = 0;
2594         device->io_width = fs_info->sectorsize;
2595         device->io_align = fs_info->sectorsize;
2596         device->sector_size = fs_info->sectorsize;
2597         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2598         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2599         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2600         ASSERT(list_empty(&srcdev->resized_list));
2601         device->commit_total_bytes = srcdev->commit_total_bytes;
2602         device->commit_bytes_used = device->bytes_used;
2603         device->fs_info = fs_info;
2604         device->bdev = bdev;
2605         device->in_fs_metadata = 1;
2606         device->is_tgtdev_for_dev_replace = 1;
2607         device->mode = FMODE_EXCL;
2608         device->dev_stats_valid = 1;
2609         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2610         device->fs_devices = fs_info->fs_devices;
2611         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2612         fs_info->fs_devices->num_devices++;
2613         fs_info->fs_devices->open_devices++;
2614         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2615
2616         *device_out = device;
2617         return ret;
2618
2619 error:
2620         blkdev_put(bdev, FMODE_EXCL);
2621         return ret;
2622 }
2623
2624 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2625                                               struct btrfs_device *tgtdev)
2626 {
2627         u32 sectorsize = fs_info->sectorsize;
2628
2629         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2630         tgtdev->io_width = sectorsize;
2631         tgtdev->io_align = sectorsize;
2632         tgtdev->sector_size = sectorsize;
2633         tgtdev->fs_info = fs_info;
2634         tgtdev->in_fs_metadata = 1;
2635 }
2636
2637 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2638                                         struct btrfs_device *device)
2639 {
2640         int ret;
2641         struct btrfs_path *path;
2642         struct btrfs_root *root = device->fs_info->chunk_root;
2643         struct btrfs_dev_item *dev_item;
2644         struct extent_buffer *leaf;
2645         struct btrfs_key key;
2646
2647         path = btrfs_alloc_path();
2648         if (!path)
2649                 return -ENOMEM;
2650
2651         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2652         key.type = BTRFS_DEV_ITEM_KEY;
2653         key.offset = device->devid;
2654
2655         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2656         if (ret < 0)
2657                 goto out;
2658
2659         if (ret > 0) {
2660                 ret = -ENOENT;
2661                 goto out;
2662         }
2663
2664         leaf = path->nodes[0];
2665         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2666
2667         btrfs_set_device_id(leaf, dev_item, device->devid);
2668         btrfs_set_device_type(leaf, dev_item, device->type);
2669         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2670         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2671         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2672         btrfs_set_device_total_bytes(leaf, dev_item,
2673                                      btrfs_device_get_disk_total_bytes(device));
2674         btrfs_set_device_bytes_used(leaf, dev_item,
2675                                     btrfs_device_get_bytes_used(device));
2676         btrfs_mark_buffer_dirty(leaf);
2677
2678 out:
2679         btrfs_free_path(path);
2680         return ret;
2681 }
2682
2683 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2684                       struct btrfs_device *device, u64 new_size)
2685 {
2686         struct btrfs_fs_info *fs_info = device->fs_info;
2687         struct btrfs_super_block *super_copy = fs_info->super_copy;
2688         struct btrfs_fs_devices *fs_devices;
2689         u64 old_total;
2690         u64 diff;
2691
2692         if (!device->writeable)
2693                 return -EACCES;
2694
2695         new_size = round_down(new_size, fs_info->sectorsize);
2696
2697         mutex_lock(&fs_info->chunk_mutex);
2698         old_total = btrfs_super_total_bytes(super_copy);
2699         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2700
2701         if (new_size <= device->total_bytes ||
2702             device->is_tgtdev_for_dev_replace) {
2703                 mutex_unlock(&fs_info->chunk_mutex);
2704                 return -EINVAL;
2705         }
2706
2707         fs_devices = fs_info->fs_devices;
2708
2709         btrfs_set_super_total_bytes(super_copy,
2710                         round_down(old_total + diff, fs_info->sectorsize));
2711         device->fs_devices->total_rw_bytes += diff;
2712
2713         btrfs_device_set_total_bytes(device, new_size);
2714         btrfs_device_set_disk_total_bytes(device, new_size);
2715         btrfs_clear_space_info_full(device->fs_info);
2716         if (list_empty(&device->resized_list))
2717                 list_add_tail(&device->resized_list,
2718                               &fs_devices->resized_devices);
2719         mutex_unlock(&fs_info->chunk_mutex);
2720
2721         return btrfs_update_device(trans, device);
2722 }
2723
2724 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2725                             struct btrfs_fs_info *fs_info, u64 chunk_offset)
2726 {
2727         struct btrfs_root *root = fs_info->chunk_root;
2728         int ret;
2729         struct btrfs_path *path;
2730         struct btrfs_key key;
2731
2732         path = btrfs_alloc_path();
2733         if (!path)
2734                 return -ENOMEM;
2735
2736         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2737         key.offset = chunk_offset;
2738         key.type = BTRFS_CHUNK_ITEM_KEY;
2739
2740         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2741         if (ret < 0)
2742                 goto out;
2743         else if (ret > 0) { /* Logic error or corruption */
2744                 btrfs_handle_fs_error(fs_info, -ENOENT,
2745                                       "Failed lookup while freeing chunk.");
2746                 ret = -ENOENT;
2747                 goto out;
2748         }
2749
2750         ret = btrfs_del_item(trans, root, path);
2751         if (ret < 0)
2752                 btrfs_handle_fs_error(fs_info, ret,
2753                                       "Failed to delete chunk item.");
2754 out:
2755         btrfs_free_path(path);
2756         return ret;
2757 }
2758
2759 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2760 {
2761         struct btrfs_super_block *super_copy = fs_info->super_copy;
2762         struct btrfs_disk_key *disk_key;
2763         struct btrfs_chunk *chunk;
2764         u8 *ptr;
2765         int ret = 0;
2766         u32 num_stripes;
2767         u32 array_size;
2768         u32 len = 0;
2769         u32 cur;
2770         struct btrfs_key key;
2771
2772         mutex_lock(&fs_info->chunk_mutex);
2773         array_size = btrfs_super_sys_array_size(super_copy);
2774
2775         ptr = super_copy->sys_chunk_array;
2776         cur = 0;
2777
2778         while (cur < array_size) {
2779                 disk_key = (struct btrfs_disk_key *)ptr;
2780                 btrfs_disk_key_to_cpu(&key, disk_key);
2781
2782                 len = sizeof(*disk_key);
2783
2784                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2785                         chunk = (struct btrfs_chunk *)(ptr + len);
2786                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2787                         len += btrfs_chunk_item_size(num_stripes);
2788                 } else {
2789                         ret = -EIO;
2790                         break;
2791                 }
2792                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2793                     key.offset == chunk_offset) {
2794                         memmove(ptr, ptr + len, array_size - (cur + len));
2795                         array_size -= len;
2796                         btrfs_set_super_sys_array_size(super_copy, array_size);
2797                 } else {
2798                         ptr += len;
2799                         cur += len;
2800                 }
2801         }
2802         mutex_unlock(&fs_info->chunk_mutex);
2803         return ret;
2804 }
2805
2806 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2807                                         u64 logical, u64 length)
2808 {
2809         struct extent_map_tree *em_tree;
2810         struct extent_map *em;
2811
2812         em_tree = &fs_info->mapping_tree.map_tree;
2813         read_lock(&em_tree->lock);
2814         em = lookup_extent_mapping(em_tree, logical, length);
2815         read_unlock(&em_tree->lock);
2816
2817         if (!em) {
2818                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2819                            logical, length);
2820                 return ERR_PTR(-EINVAL);
2821         }
2822
2823         if (em->start > logical || em->start + em->len < logical) {
2824                 btrfs_crit(fs_info,
2825                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2826                            logical, length, em->start, em->start + em->len);
2827                 free_extent_map(em);
2828                 return ERR_PTR(-EINVAL);
2829         }
2830
2831         /* callers are responsible for dropping em's ref. */
2832         return em;
2833 }
2834
2835 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2836                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2837 {
2838         struct extent_map *em;
2839         struct map_lookup *map;
2840         u64 dev_extent_len = 0;
2841         int i, ret = 0;
2842         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2843
2844         em = get_chunk_map(fs_info, chunk_offset, 1);
2845         if (IS_ERR(em)) {
2846                 /*
2847                  * This is a logic error, but we don't want to just rely on the
2848                  * user having built with ASSERT enabled, so if ASSERT doesn't
2849                  * do anything we still error out.
2850                  */
2851                 ASSERT(0);
2852                 return PTR_ERR(em);
2853         }
2854         map = em->map_lookup;
2855         mutex_lock(&fs_info->chunk_mutex);
2856         check_system_chunk(trans, fs_info, map->type);
2857         mutex_unlock(&fs_info->chunk_mutex);
2858
2859         /*
2860          * Take the device list mutex to prevent races with the final phase of
2861          * a device replace operation that replaces the device object associated
2862          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2863          */
2864         mutex_lock(&fs_devices->device_list_mutex);
2865         for (i = 0; i < map->num_stripes; i++) {
2866                 struct btrfs_device *device = map->stripes[i].dev;
2867                 ret = btrfs_free_dev_extent(trans, device,
2868                                             map->stripes[i].physical,
2869                                             &dev_extent_len);
2870                 if (ret) {
2871                         mutex_unlock(&fs_devices->device_list_mutex);
2872                         btrfs_abort_transaction(trans, ret);
2873                         goto out;
2874                 }
2875
2876                 if (device->bytes_used > 0) {
2877                         mutex_lock(&fs_info->chunk_mutex);
2878                         btrfs_device_set_bytes_used(device,
2879                                         device->bytes_used - dev_extent_len);
2880                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2881                         btrfs_clear_space_info_full(fs_info);
2882                         mutex_unlock(&fs_info->chunk_mutex);
2883                 }
2884
2885                 if (map->stripes[i].dev) {
2886                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2887                         if (ret) {
2888                                 mutex_unlock(&fs_devices->device_list_mutex);
2889                                 btrfs_abort_transaction(trans, ret);
2890                                 goto out;
2891                         }
2892                 }
2893         }
2894         mutex_unlock(&fs_devices->device_list_mutex);
2895
2896         ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2897         if (ret) {
2898                 btrfs_abort_transaction(trans, ret);
2899                 goto out;
2900         }
2901
2902         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2903
2904         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2905                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2906                 if (ret) {
2907                         btrfs_abort_transaction(trans, ret);
2908                         goto out;
2909                 }
2910         }
2911
2912         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2913         if (ret) {
2914                 btrfs_abort_transaction(trans, ret);
2915                 goto out;
2916         }
2917
2918 out:
2919         /* once for us */
2920         free_extent_map(em);
2921         return ret;
2922 }
2923
2924 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2925 {
2926         struct btrfs_root *root = fs_info->chunk_root;
2927         struct btrfs_trans_handle *trans;
2928         int ret;
2929
2930         /*
2931          * Prevent races with automatic removal of unused block groups.
2932          * After we relocate and before we remove the chunk with offset
2933          * chunk_offset, automatic removal of the block group can kick in,
2934          * resulting in a failure when calling btrfs_remove_chunk() below.
2935          *
2936          * Make sure to acquire this mutex before doing a tree search (dev
2937          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2938          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2939          * we release the path used to search the chunk/dev tree and before
2940          * the current task acquires this mutex and calls us.
2941          */
2942         ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
2943
2944         ret = btrfs_can_relocate(fs_info, chunk_offset);
2945         if (ret)
2946                 return -ENOSPC;
2947
2948         /* step one, relocate all the extents inside this chunk */
2949         btrfs_scrub_pause(fs_info);
2950         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2951         btrfs_scrub_continue(fs_info);
2952         if (ret)
2953                 return ret;
2954
2955         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2956                                                      chunk_offset);
2957         if (IS_ERR(trans)) {
2958                 ret = PTR_ERR(trans);
2959                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2960                 return ret;
2961         }
2962
2963         /*
2964          * step two, delete the device extents and the
2965          * chunk tree entries
2966          */
2967         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2968         btrfs_end_transaction(trans);
2969         return ret;
2970 }
2971
2972 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2973 {
2974         struct btrfs_root *chunk_root = fs_info->chunk_root;
2975         struct btrfs_path *path;
2976         struct extent_buffer *leaf;
2977         struct btrfs_chunk *chunk;
2978         struct btrfs_key key;
2979         struct btrfs_key found_key;
2980         u64 chunk_type;
2981         bool retried = false;
2982         int failed = 0;
2983         int ret;
2984
2985         path = btrfs_alloc_path();
2986         if (!path)
2987                 return -ENOMEM;
2988
2989 again:
2990         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2991         key.offset = (u64)-1;
2992         key.type = BTRFS_CHUNK_ITEM_KEY;
2993
2994         while (1) {
2995                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2996                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2997                 if (ret < 0) {
2998                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2999                         goto error;
3000                 }
3001                 BUG_ON(ret == 0); /* Corruption */
3002
3003                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3004                                           key.type);
3005                 if (ret)
3006                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3007                 if (ret < 0)
3008                         goto error;
3009                 if (ret > 0)
3010                         break;
3011
3012                 leaf = path->nodes[0];
3013                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3014
3015                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3016                                        struct btrfs_chunk);
3017                 chunk_type = btrfs_chunk_type(leaf, chunk);
3018                 btrfs_release_path(path);
3019
3020                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3021                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3022                         if (ret == -ENOSPC)
3023                                 failed++;
3024                         else
3025                                 BUG_ON(ret);
3026                 }
3027                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3028
3029                 if (found_key.offset == 0)
3030                         break;
3031                 key.offset = found_key.offset - 1;
3032         }
3033         ret = 0;
3034         if (failed && !retried) {
3035                 failed = 0;
3036                 retried = true;
3037                 goto again;
3038         } else if (WARN_ON(failed && retried)) {
3039                 ret = -ENOSPC;
3040         }
3041 error:
3042         btrfs_free_path(path);
3043         return ret;
3044 }
3045
3046 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3047                                struct btrfs_balance_control *bctl)
3048 {
3049         struct btrfs_root *root = fs_info->tree_root;
3050         struct btrfs_trans_handle *trans;
3051         struct btrfs_balance_item *item;
3052         struct btrfs_disk_balance_args disk_bargs;
3053         struct btrfs_path *path;
3054         struct extent_buffer *leaf;
3055         struct btrfs_key key;
3056         int ret, err;
3057
3058         path = btrfs_alloc_path();
3059         if (!path)
3060                 return -ENOMEM;
3061
3062         trans = btrfs_start_transaction(root, 0);
3063         if (IS_ERR(trans)) {
3064                 btrfs_free_path(path);
3065                 return PTR_ERR(trans);
3066         }
3067
3068         key.objectid = BTRFS_BALANCE_OBJECTID;
3069         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3070         key.offset = 0;
3071
3072         ret = btrfs_insert_empty_item(trans, root, path, &key,
3073                                       sizeof(*item));
3074         if (ret)
3075                 goto out;
3076
3077         leaf = path->nodes[0];
3078         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3079
3080         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3081
3082         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3083         btrfs_set_balance_data(leaf, item, &disk_bargs);
3084         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3085         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3086         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3087         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3088
3089         btrfs_set_balance_flags(leaf, item, bctl->flags);
3090
3091         btrfs_mark_buffer_dirty(leaf);
3092 out:
3093         btrfs_free_path(path);
3094         err = btrfs_commit_transaction(trans);
3095         if (err && !ret)
3096                 ret = err;
3097         return ret;
3098 }
3099
3100 static int del_balance_item(struct btrfs_fs_info *fs_info)
3101 {
3102         struct btrfs_root *root = fs_info->tree_root;
3103         struct btrfs_trans_handle *trans;
3104         struct btrfs_path *path;
3105         struct btrfs_key key;
3106         int ret, err;
3107
3108         path = btrfs_alloc_path();
3109         if (!path)
3110                 return -ENOMEM;
3111
3112         trans = btrfs_start_transaction(root, 0);
3113         if (IS_ERR(trans)) {
3114                 btrfs_free_path(path);
3115                 return PTR_ERR(trans);
3116         }
3117
3118         key.objectid = BTRFS_BALANCE_OBJECTID;
3119         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3120         key.offset = 0;
3121
3122         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3123         if (ret < 0)
3124                 goto out;
3125         if (ret > 0) {
3126                 ret = -ENOENT;
3127                 goto out;
3128         }
3129
3130         ret = btrfs_del_item(trans, root, path);
3131 out:
3132         btrfs_free_path(path);
3133         err = btrfs_commit_transaction(trans);
3134         if (err && !ret)
3135                 ret = err;
3136         return ret;
3137 }
3138
3139 /*
3140  * This is a heuristic used to reduce the number of chunks balanced on
3141  * resume after balance was interrupted.
3142  */
3143 static void update_balance_args(struct btrfs_balance_control *bctl)
3144 {
3145         /*
3146          * Turn on soft mode for chunk types that were being converted.
3147          */
3148         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3149                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3150         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3151                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3152         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3153                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3154
3155         /*
3156          * Turn on usage filter if is not already used.  The idea is
3157          * that chunks that we have already balanced should be
3158          * reasonably full.  Don't do it for chunks that are being
3159          * converted - that will keep us from relocating unconverted
3160          * (albeit full) chunks.
3161          */
3162         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3163             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3164             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3165                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3166                 bctl->data.usage = 90;
3167         }
3168         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3169             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3170             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3171                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3172                 bctl->sys.usage = 90;
3173         }
3174         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3175             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3176             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3177                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3178                 bctl->meta.usage = 90;
3179         }
3180 }
3181
3182 /*
3183  * Should be called with both balance and volume mutexes held to
3184  * serialize other volume operations (add_dev/rm_dev/resize) with
3185  * restriper.  Same goes for unset_balance_control.
3186  */
3187 static void set_balance_control(struct btrfs_balance_control *bctl)
3188 {
3189         struct btrfs_fs_info *fs_info = bctl->fs_info;
3190
3191         BUG_ON(fs_info->balance_ctl);
3192
3193         spin_lock(&fs_info->balance_lock);
3194         fs_info->balance_ctl = bctl;
3195         spin_unlock(&fs_info->balance_lock);
3196 }
3197
3198 static void unset_balance_control(struct btrfs_fs_info *fs_info)
3199 {
3200         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3201
3202         BUG_ON(!fs_info->balance_ctl);
3203
3204         spin_lock(&fs_info->balance_lock);
3205         fs_info->balance_ctl = NULL;
3206         spin_unlock(&fs_info->balance_lock);
3207
3208         kfree(bctl);
3209 }
3210
3211 /*
3212  * Balance filters.  Return 1 if chunk should be filtered out
3213  * (should not be balanced).
3214  */
3215 static int chunk_profiles_filter(u64 chunk_type,
3216                                  struct btrfs_balance_args *bargs)
3217 {
3218         chunk_type = chunk_to_extended(chunk_type) &
3219                                 BTRFS_EXTENDED_PROFILE_MASK;
3220
3221         if (bargs->profiles & chunk_type)
3222                 return 0;
3223
3224         return 1;
3225 }
3226
3227 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3228                               struct btrfs_balance_args *bargs)
3229 {
3230         struct btrfs_block_group_cache *cache;
3231         u64 chunk_used;
3232         u64 user_thresh_min;
3233         u64 user_thresh_max;
3234         int ret = 1;
3235
3236         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3237         chunk_used = btrfs_block_group_used(&cache->item);
3238
3239       &