fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
        return 0;
}
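
/*
 * Illustrative sketch (not part of the original file): how a bio gets
 * handed to the per-device worker above.  This is modeled on btrfs's
 * schedule_bio(), which appears later in this file; the helper name is
 * made up and the fs_info->nr_async_bios accounting is omitted.
 */
static void example_queue_bio(struct btrfs_device *device, int rw,
                              struct bio *bio)
{
        struct btrfs_pending_bios *pending_bios;

        spin_lock(&device->io_lock);
        /* sync bios go on the high-priority list run_scheduled_bios favors */
        if (rw & REQ_SYNC)
                pending_bios = &device->pending_sync_bios;
        else
                pending_bios = &device->pending_bios;

        /* append to the singly linked bi_next chain */
        bio->bi_next = NULL;
        if (pending_bios->tail)
                pending_bios->tail->bi_next = bio;
        pending_bios->tail = bio;
        if (!pending_bios->head)
                pending_bios->head = bio;

        /* kick the worker only if run_scheduled_bios() isn't already up */
        if (!device->running_pending) {
                device->running_pending = 1;
                spin_unlock(&device->io_lock);
                btrfs_queue_worker(&device->dev_root->fs_info->submit_workers,
                                   &device->work);
                return;
        }
        spin_unlock(&device->io_lock);
}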

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        mutex_lock(&orig->device_list_mutex);
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        mutex_lock(&uuid_mutex);
again:
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        mutex_unlock(&uuid_mutex);
        return 0;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                device->bdev = NULL;
                device->writeable = 0;
                device->in_fs_metadata = 0;
        }
        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EIO;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else {
                /* FIXME, make a real uuid parser */
                printk(KERN_INFO "device fsid %llx-%llx ",
                       *(unsigned long long *)disk_super->fsid,
                       *(unsigned long long *)(disk_super->fsid + 8));
        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}
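
/*
 * Illustrative sketch (not part of the original file): the typical
 * mount-time pairing of the two entry points above.  Error handling is
 * trimmed and the function name is made up; the real sequence lives in
 * the btrfs mount path (super.c).
 */
static int example_scan_and_open(const char *path, void *holder)
{
        struct btrfs_fs_devices *fs_devices;
        int ret;

        /* read the super block off one device and register it by fsid */
        ret = btrfs_scan_one_device(path, FMODE_READ, holder, &fs_devices);
        if (ret)
                return ret;

        /* open every device known for that fsid (reference counted) */
        ret = btrfs_open_devices(fs_devices, FMODE_READ | FMODE_WRITE, holder);
        if (ret)
                return ret;

        /* ... use the filesystem; btrfs_close_devices() drops the ref */
        return btrfs_close_devices(fs_devices);
}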

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
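
/*
 * Illustrative sketch (not part of the original file): the interval
 * arithmetic behind the four-way if/else chain above, factored into a
 * pure helper.  It returns how many bytes of the dev extent
 * [ext_start, ext_end) fall inside the queried range [start, end]
 * (end is inclusive, matching the caller above).
 */
static u64 example_extent_overlap(u64 ext_start, u64 ext_end,
                                  u64 start, u64 end)
{
        u64 lo = max(ext_start, start);
        u64 hi = min(ext_end, end + 1);

        return hi > lo ? hi - lo : 0;
}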

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:      transaction handle
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        hole_size = search_end - search_start;
        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}
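
/*
 * Worked example for the hole scan above (not part of the original
 * file): on a 16MB device with dev extents at [1MB,5MB) and [9MB,12MB)
 * and num_bytes = 5MB, the search starts at the 1MB floor, skips the
 * first extent, records the 4MB hole at 5MB, then the 4MB tail hole at
 * 12MB.  No hole fits, so the function returns -ENOSPC with
 * *start = 5MB and *len = 4MB (the first maximal hole found).
 */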

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                BUG_ON(ret);
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                ret = 0;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        }
        BUG_ON(ret);

        if (device->bytes_used > 0)
                device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
        ret = btrfs_del_item(trans, root, path);
        BUG_ON(ret);

        btrfs_free_path(path);
        return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        BUG_ON(ret);

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
        return ret;
}
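
/*
 * Illustrative sketch (not part of the original file): how the chunk
 * allocator pairs the two helpers above -- find a hole on the device,
 * then record it as a dev extent pointing back at the chunk.  The
 * function name is made up; the real caller is __btrfs_alloc_chunk().
 */
static int example_alloc_on_device(struct btrfs_trans_handle *trans,
                                   struct btrfs_device *device,
                                   u64 chunk_offset, u64 num_bytes)
{
        u64 dev_offset;
        int ret;

        ret = find_free_dev_extent(trans, device, num_bytes,
                                   &dev_offset, NULL);
        if (ret)
                return ret;

        return btrfs_alloc_dev_extent(trans, device,
                                      BTRFS_CHUNK_TREE_OBJECTID,
                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                      chunk_offset, dev_offset, num_bytes);
}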

static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        BUG_ON(!path);

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}
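
/*
 * Illustrative sketch (not part of the original file): the search
 * pattern shared by find_next_chunk() and find_next_devid() above.
 * Seeking to offset (u64)-1 can never match exactly, so
 * btrfs_search_slot() returns > 0 and btrfs_previous_item() steps back
 * to the last existing key; "last key + 1" is then the next free id.
 */
static int example_find_last_key(struct btrfs_root *root, u64 objectid,
                                 int key_type, struct btrfs_key *last)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        last->objectid = objectid;
        last->type = key_type;
        last->offset = (u64)-1;         /* larger than any real offset */

        ret = btrfs_search_slot(NULL, root, last, path, 0, 0);
        if (ret < 0)
                goto out;
        /* no exact match is possible, so step back to the last item */
        ret = btrfs_previous_item(root, path, objectid, key_type);
        if (ret == 0)
                btrfs_item_key_to_cpu(path->nodes[0], last, path->slots[0]);
out:
        btrfs_free_path(path);
        return ret;
}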

/*
 * the device information is stored in the chunk root, and
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;

        mutex_lock(&uuid_mutex);
        mutex_lock(&root->fs_info->volume_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                list_del_init(&device->dev_alloc_list);
                root->fs_info->fs_devices->rw_devices--;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        device->in_fs_metadata = 0;

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_init(&device->dev_list);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev) {
                blkdev_put(device->bdev, device->mode);
                device->bdev = NULL;
                device->fs_devices->open_devices--;
        }

        num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

        if (device->fs_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == device->fs_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = device->fs_devices->seed;
                device->fs_devices->seed = NULL;
                __btrfs_close_devices(device->fs_devices);
                free_fs_devices(device->fs_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (device->writeable) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        kfree(device->name);
        kfree(device);
        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);
        list_splice_init(&fs_devices->devices, &seed_devices->devices);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}
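
/*
 * Illustrative sketch (not part of the original file): walking the seed
 * chain that btrfs_prepare_sprout() builds.  After sprouting, the
 * writable fs_devices reaches the read-only seed filesystem(s) through
 * ->seed; the helper name is made up.
 */
static u64 example_count_seed_devices(struct btrfs_fs_devices *fs_devices)
{
        u64 count = 0;

        for (fs_devices = fs_devices->seed; fs_devices;
             fs_devices = fs_devices->seed)
                count += fs_devices->num_devices;
        return count;
}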

/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dev_item *dev_item;
        struct btrfs_device *device;
        struct btrfs_key key;
        u8 fs_uuid[BTRFS_UUID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
        u64 devid;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        root = root->fs_info->chunk_root;
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = BTRFS_DEV_ITEM_KEY;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
                if (ret < 0)
                        goto error;

                leaf = path->nodes[0];
next_slot:
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret > 0)
                                break;
                        if (ret < 0)
                                goto error;
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                        btrfs_release_path(path);
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
                    key.type != BTRFS_DEV_ITEM_KEY)
                        break;

                dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_dev_item);
                devid = btrfs_device_id(leaf, dev_item);
                read_extent_buffer(leaf, dev_uuid,
                                   (unsigned long)btrfs_device_uuid(dev_item),
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid,
                                   (unsigned long)btrfs_device_fsid(dev_item),
                                   BTRFS_UUID_SIZE);
                device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
                BUG_ON(!device);

                if (device->fs_devices->seeding) {
                        btrfs_set_device_generation(leaf, dev_item,
                                                    device->generation);
                        btrfs_mark_buffer_dirty(leaf);
                }

                path->slots[0]++;
                goto next_slot;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}
1500
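/*
 * Add a new device to a mounted filesystem.  The flow below is: open
 * the block device exclusively, reject it if it is already part of
 * this filesystem, allocate and fill a btrfs_device, and then, inside
 * a transaction, either sprout a new writable filesystem on top of a
 * seed filesystem (btrfs_prepare_sprout + init_first_rw_device +
 * btrfs_finish_sprout) or simply record the device in the chunk tree
 * with btrfs_add_device.
 */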
1501 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1502 {
1503         struct btrfs_trans_handle *trans;
1504         struct btrfs_device *device;
1505         struct block_device *bdev;
1506         struct list_head *devices;
1507         struct super_block *sb = root->fs_info->sb;
1508         u64 total_bytes;
1509         int seeding_dev = 0;
1510         int ret = 0;
1511
1512         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1513                 return -EINVAL;
1514
1515         bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1516                                   root->fs_info->bdev_holder);
1517         if (IS_ERR(bdev))
1518                 return PTR_ERR(bdev);
1519
1520         if (root->fs_info->fs_devices->seeding) {
1521                 seeding_dev = 1;
1522                 down_write(&sb->s_umount);
1523                 mutex_lock(&uuid_mutex);
1524         }
1525
1526         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1527         mutex_lock(&root->fs_info->volume_mutex);
1528
1529         devices = &root->fs_info->fs_devices->devices;
1530         /*
1531          * we have the volume lock, so we don't need the extra
1532          * device list mutex while reading the list here.
1533          */
1534         list_for_each_entry(device, devices, dev_list) {
1535                 if (device->bdev == bdev) {
1536                         ret = -EEXIST;
1537                         goto error;
1538                 }
1539         }
1540
1541         device = kzalloc(sizeof(*device), GFP_NOFS);
1542         if (!device) {
1543                 /* we can safely leave the fs_devices entry around */
1544                 ret = -ENOMEM;
1545                 goto error;
1546         }
1547
1548         device->name = kstrdup(device_path, GFP_NOFS);
1549         if (!device->name) {
1550                 kfree(device);
1551                 ret = -ENOMEM;
1552                 goto error;
1553         }
1554
1555         ret = find_next_devid(root, &device->devid);
1556         if (ret) {
1557                 kfree(device->name);
1558                 kfree(device);
1559                 goto error;
1560         }
1561
1562         trans = btrfs_start_transaction(root, 0);
1563         if (IS_ERR(trans)) {
1564                 kfree(device->name);
1565                 kfree(device);
1566                 ret = PTR_ERR(trans);
1567                 goto error;
1568         }
1569
1570         lock_chunks(root);
1571
1572         device->writeable = 1;
1573         device->work.func = pending_bios_fn;
1574         generate_random_uuid(device->uuid);
1575         spin_lock_init(&device->io_lock);
1576         device->generation = trans->transid;
1577         device->io_width = root->sectorsize;
1578         device->io_align = root->sectorsize;
1579         device->sector_size = root->sectorsize;
1580         device->total_bytes = i_size_read(bdev->bd_inode);
1581         device->disk_total_bytes = device->total_bytes;
1582         device->dev_root = root->fs_info->dev_root;
1583         device->bdev = bdev;
1584         device->in_fs_metadata = 1;
1585         device->mode = FMODE_EXCL;
1586         set_blocksize(device->bdev, 4096);
1587
1588         if (seeding_dev) {
1589                 sb->s_flags &= ~MS_RDONLY;
1590                 ret = btrfs_prepare_sprout(trans, root);
1591                 BUG_ON(ret);
1592         }
1593
1594         device->fs_devices = root->fs_info->fs_devices;
1595
1596         /*
1597          * we don't want write_supers to jump in here with our device
1598          * half setup
1599          */
1600         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1601         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1602         list_add(&device->dev_alloc_list,
1603                  &root->fs_info->fs_devices->alloc_list);
1604         root->fs_info->fs_devices->num_devices++;
1605         root->fs_info->fs_devices->open_devices++;
1606         root->fs_info->fs_devices->rw_devices++;
1607         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1608
1609         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1610                 root->fs_info->fs_devices->rotating = 1;
1611
1612         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1613         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1614                                     total_bytes + device->total_bytes);
1615
1616         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1617         btrfs_set_super_num_devices(&root->fs_info->super_copy,
1618                                     total_bytes + 1);
1619         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1620
1621         if (seeding_dev) {
1622                 ret = init_first_rw_device(trans, root, device);
1623                 BUG_ON(ret);
1624                 ret = btrfs_finish_sprout(trans, root);
1625                 BUG_ON(ret);
1626         } else {
1627                 ret = btrfs_add_device(trans, root, device);
1628         }
1629
1630         /*
1631          * we've got more storage, clear any full flags on the space
1632          * infos
1633          */
1634         btrfs_clear_space_info_full(root->fs_info);
1635
1636         unlock_chunks(root);
1637         btrfs_commit_transaction(trans, root);
1638
1639         if (seeding_dev) {
1640                 mutex_unlock(&uuid_mutex);
1641                 up_write(&sb->s_umount);
1642
1643                 ret = btrfs_relocate_sys_chunks(root);
1644                 BUG_ON(ret);
1645         }
1646 out:
1647         mutex_unlock(&root->fs_info->volume_mutex);
1648         return ret;
1649 error:
1650         blkdev_put(bdev, FMODE_EXCL);
1651         if (seeding_dev) {
1652                 mutex_unlock(&uuid_mutex);
1653                 up_write(&sb->s_umount);
1654         }
1655         goto out;
1656 }
1657
1658 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1659                                         struct btrfs_device *device)
1660 {
1661         int ret;
1662         struct btrfs_path *path;
1663         struct btrfs_root *root;
1664         struct btrfs_dev_item *dev_item;
1665         struct extent_buffer *leaf;
1666         struct btrfs_key key;
1667
1668         root = device->dev_root->fs_info->chunk_root;
1669
1670         path = btrfs_alloc_path();
1671         if (!path)
1672                 return -ENOMEM;
1673
1674         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1675         key.type = BTRFS_DEV_ITEM_KEY;
1676         key.offset = device->devid;
1677
1678         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1679         if (ret < 0)
1680                 goto out;
1681
1682         if (ret > 0) {
1683                 ret = -ENOENT;
1684                 goto out;
1685         }
1686
1687         leaf = path->nodes[0];
1688         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1689
1690         btrfs_set_device_id(leaf, dev_item, device->devid);
1691         btrfs_set_device_type(leaf, dev_item, device->type);
1692         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1693         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1694         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1695         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1696         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1697         btrfs_mark_buffer_dirty(leaf);
1698
1699 out:
1700         btrfs_free_path(path);
1701         return ret;
1702 }
1703
1704 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1705                       struct btrfs_device *device, u64 new_size)
1706 {
1707         struct btrfs_super_block *super_copy =
1708                 &device->dev_root->fs_info->super_copy;
1709         u64 old_total = btrfs_super_total_bytes(super_copy);
1710         u64 diff = new_size - device->total_bytes;
1711
1712         if (!device->writeable)
1713                 return -EACCES;
1714         if (new_size <= device->total_bytes)
1715                 return -EINVAL;
1716
1717         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1718         device->fs_devices->total_rw_bytes += diff;
1719
1720         device->total_bytes = new_size;
1721         device->disk_total_bytes = new_size;
1722         btrfs_clear_space_info_full(device->dev_root->fs_info);
1723
1724         return btrfs_update_device(trans, device);
1725 }
1726
1727 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1728                       struct btrfs_device *device, u64 new_size)
1729 {
1730         int ret;
1731         lock_chunks(device->dev_root);
1732         ret = __btrfs_grow_device(trans, device, new_size);
1733         unlock_chunks(device->dev_root);
1734         return ret;
1735 }
1736
1737 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1738                             struct btrfs_root *root,
1739                             u64 chunk_tree, u64 chunk_objectid,
1740                             u64 chunk_offset)
1741 {
1742         int ret;
1743         struct btrfs_path *path;
1744         struct btrfs_key key;
1745
1746         root = root->fs_info->chunk_root;
1747         path = btrfs_alloc_path();
1748         if (!path)
1749                 return -ENOMEM;
1750
1751         key.objectid = chunk_objectid;
1752         key.offset = chunk_offset;
1753         key.type = BTRFS_CHUNK_ITEM_KEY;
1754
1755         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1756         BUG_ON(ret);
1757
1758         ret = btrfs_del_item(trans, root, path);
1759         BUG_ON(ret);
1760
1761         btrfs_free_path(path);
1762         return 0;
1763 }
1764
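/*
 * The superblock's sys_chunk_array is a packed byte array of
 * (struct btrfs_disk_key, struct btrfs_chunk incl. its stripes) pairs:
 *
 *   [key 0][chunk 0 + stripes][key 1][chunk 1 + stripes]...
 *
 * The loop below walks that layout and memmove()s the tail down over
 * the matching entry to delete it.
 */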
1765 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1766                         chunk_offset)
1767 {
1768         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1769         struct btrfs_disk_key *disk_key;
1770         struct btrfs_chunk *chunk;
1771         u8 *ptr;
1772         int ret = 0;
1773         u32 num_stripes;
1774         u32 array_size;
1775         u32 len = 0;
1776         u32 cur;
1777         struct btrfs_key key;
1778
1779         array_size = btrfs_super_sys_array_size(super_copy);
1780
1781         ptr = super_copy->sys_chunk_array;
1782         cur = 0;
1783
1784         while (cur < array_size) {
1785                 disk_key = (struct btrfs_disk_key *)ptr;
1786                 btrfs_disk_key_to_cpu(&key, disk_key);
1787
1788                 len = sizeof(*disk_key);
1789
1790                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1791                         chunk = (struct btrfs_chunk *)(ptr + len);
1792                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1793                         len += btrfs_chunk_item_size(num_stripes);
1794                 } else {
1795                         ret = -EIO;
1796                         break;
1797                 }
1798                 if (key.objectid == chunk_objectid &&
1799                     key.offset == chunk_offset) {
1800                         memmove(ptr, ptr + len, array_size - (cur + len));
1801                         array_size -= len;
1802                         btrfs_set_super_sys_array_size(super_copy, array_size);
1803                 } else {
1804                         ptr += len;
1805                         cur += len;
1806                 }
1807         }
1808         return ret;
1809 }
1810
1811 static int btrfs_relocate_chunk(struct btrfs_root *root,
1812                          u64 chunk_tree, u64 chunk_objectid,
1813                          u64 chunk_offset)
1814 {
1815         struct extent_map_tree *em_tree;
1816         struct btrfs_root *extent_root;
1817         struct btrfs_trans_handle *trans;
1818         struct extent_map *em;
1819         struct map_lookup *map;
1820         int ret;
1821         int i;
1822
1823         root = root->fs_info->chunk_root;
1824         extent_root = root->fs_info->extent_root;
1825         em_tree = &root->fs_info->mapping_tree.map_tree;
1826
1827         ret = btrfs_can_relocate(extent_root, chunk_offset);
1828         if (ret)
1829                 return -ENOSPC;
1830
1831         /* step one, relocate all the extents inside this chunk */
1832         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1833         if (ret)
1834                 return ret;
1835
1836         trans = btrfs_start_transaction(root, 0);
1837         BUG_ON(IS_ERR(trans));
1838
1839         lock_chunks(root);
1840
1841         /*
1842          * step two, delete the device extents and the
1843          * chunk tree entries
1844          */
1845         read_lock(&em_tree->lock);
1846         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1847         read_unlock(&em_tree->lock);
1848
1849         BUG_ON(em->start > chunk_offset ||
1850                em->start + em->len < chunk_offset);
1851         map = (struct map_lookup *)em->bdev;
1852
1853         for (i = 0; i < map->num_stripes; i++) {
1854                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1855                                             map->stripes[i].physical);
1856                 BUG_ON(ret);
1857
1858                 if (map->stripes[i].dev) {
1859                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1860                         BUG_ON(ret);
1861                 }
1862         }
1863         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1864                                chunk_offset);
1865
1866         BUG_ON(ret);
1867
1868         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1869
1870         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1871                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1872                 BUG_ON(ret);
1873         }
1874
1875         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1876         BUG_ON(ret);
1877
1878         write_lock(&em_tree->lock);
1879         remove_extent_mapping(em_tree, em);
1880         write_unlock(&em_tree->lock);
1881
1882         kfree(map);
1883         em->bdev = NULL;
1884
1885         /* once for the tree */
1886         free_extent_map(em);
1887         /* once for us */
1888         free_extent_map(em);
1889
1890         unlock_chunks(root);
1891         btrfs_end_transaction(trans, root);
1892         return 0;
1893 }
1894
1895 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1896 {
1897         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1898         struct btrfs_path *path;
1899         struct extent_buffer *leaf;
1900         struct btrfs_chunk *chunk;
1901         struct btrfs_key key;
1902         struct btrfs_key found_key;
1903         u64 chunk_tree = chunk_root->root_key.objectid;
1904         u64 chunk_type;
1905         bool retried = false;
1906         int failed = 0;
1907         int ret;
1908
1909         path = btrfs_alloc_path();
1910         if (!path)
1911                 return -ENOMEM;
1912
1913 again:
1914         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1915         key.offset = (u64)-1;
1916         key.type = BTRFS_CHUNK_ITEM_KEY;
1917
1918         while (1) {
1919                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1920                 if (ret < 0)
1921                         goto error;
1922                 BUG_ON(ret == 0);
1923
1924                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1925                                           key.type);
1926                 if (ret < 0)
1927                         goto error;
1928                 if (ret > 0)
1929                         break;
1930
1931                 leaf = path->nodes[0];
1932                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1933
1934                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1935                                        struct btrfs_chunk);
1936                 chunk_type = btrfs_chunk_type(leaf, chunk);
1937                 btrfs_release_path(path);
1938
1939                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1940                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1941                                                    found_key.objectid,
1942                                                    found_key.offset);
1943                         if (ret == -ENOSPC)
1944                                 failed++;
1945                         else if (ret)
1946                                 BUG();
1947                 }
1948
1949                 if (found_key.offset == 0)
1950                         break;
1951                 key.offset = found_key.offset - 1;
1952         }
1953         ret = 0;
1954         if (failed && !retried) {
1955                 failed = 0;
1956                 retried = true;
1957                 goto again;
1958         } else if (failed && retried) {
1959                 WARN_ON(1);
1960                 ret = -ENOSPC;
1961         }
1962 error:
1963         btrfs_free_path(path);
1964         return ret;
1965 }
1966
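/*
 * Scale num by factor/10, e.g. div_factor(1000, 3) == 300; a factor of
 * 10 returns num unchanged and skips the 64-bit division.
 */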
1967 static u64 div_factor(u64 num, int factor)
1968 {
1969         if (factor == 10)
1970                 return num;
1971         num *= factor;
1972         do_div(num, 10);
1973         return num;
1974 }
1975
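/*
 * Balance runs in two steps: first trim a little space from the end of
 * every writable device and grow it back, forcing data near the end of
 * each device to be relocated, then walk the chunk tree from the
 * highest chunk offset down and relocate each chunk in turn.
 */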
1976 int btrfs_balance(struct btrfs_root *dev_root)
1977 {
1978         int ret;
1979         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1980         struct btrfs_device *device;
1981         u64 old_size;
1982         u64 size_to_free;
1983         struct btrfs_path *path;
1984         struct btrfs_key key;
1985         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1986         struct btrfs_trans_handle *trans;
1987         struct btrfs_key found_key;
1988
1989         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1990                 return -EROFS;
1991
1992         if (!capable(CAP_SYS_ADMIN))
1993                 return -EPERM;
1994
1995         mutex_lock(&dev_root->fs_info->volume_mutex);
1996         dev_root = dev_root->fs_info->dev_root;
1997
1998         /* step one, make some room on all the devices */
1999         list_for_each_entry(device, devices, dev_list) {
2000                 old_size = device->total_bytes;
2001                 size_to_free = div_factor(old_size, 1);
2002                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2003                 if (!device->writeable ||
2004                     device->total_bytes - device->bytes_used > size_to_free)
2005                         continue;
2006
2007                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2008                 if (ret == -ENOSPC)
2009                         break;
2010                 BUG_ON(ret);
2011
2012                 trans = btrfs_start_transaction(dev_root, 0);
2013                 BUG_ON(IS_ERR(trans));
2014
2015                 ret = btrfs_grow_device(trans, device, old_size);
2016                 BUG_ON(ret);
2017
2018                 btrfs_end_transaction(trans, dev_root);
2019         }
2020
2021         /* step two, relocate all the chunks */
2022         path = btrfs_alloc_path();
2023         BUG_ON(!path);
2024
2025         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2026         key.offset = (u64)-1;
2027         key.type = BTRFS_CHUNK_ITEM_KEY;
2028
2029         while (1) {
2030                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2031                 if (ret < 0)
2032                         goto error;
2033
2034                 /*
2035                  * this shouldn't happen, it means the last relocate
2036                  * failed
2037                  */
2038                 if (ret == 0)
2039                         break;
2040
2041                 ret = btrfs_previous_item(chunk_root, path, 0,
2042                                           BTRFS_CHUNK_ITEM_KEY);
2043                 if (ret)
2044                         break;
2045
2046                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2047                                       path->slots[0]);
2048                 if (found_key.objectid != key.objectid)
2049                         break;
2050
2051                 /* chunk zero is special */
2052                 if (found_key.offset == 0)
2053                         break;
2054
2055                 btrfs_release_path(path);
2056                 ret = btrfs_relocate_chunk(chunk_root,
2057                                            chunk_root->root_key.objectid,
2058                                            found_key.objectid,
2059                                            found_key.offset);
2060                 BUG_ON(ret && ret != -ENOSPC);
2061                 key.offset = found_key.offset - 1;
2062         }
2063         ret = 0;
2064 error:
2065         btrfs_free_path(path);
2066         mutex_unlock(&dev_root->fs_info->volume_mutex);
2067         return ret;
2068 }
2069
2070 /*
2071  * shrinking a device means finding all of the device extents past
2072  * the new size, and then following the back refs to the chunks.
2073  * The chunk relocation code actually frees the device extent
2074  */
2075 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2076 {
2077         struct btrfs_trans_handle *trans;
2078         struct btrfs_root *root = device->dev_root;
2079         struct btrfs_dev_extent *dev_extent = NULL;
2080         struct btrfs_path *path;
2081         u64 length;
2082         u64 chunk_tree;
2083         u64 chunk_objectid;
2084         u64 chunk_offset;
2085         int ret;
2086         int slot;
2087         int failed = 0;
2088         bool retried = false;
2089         struct extent_buffer *l;
2090         struct btrfs_key key;
2091         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2092         u64 old_total = btrfs_super_total_bytes(super_copy);
2093         u64 old_size = device->total_bytes;
2094         u64 diff = device->total_bytes - new_size;
2095
2096         if (new_size >= device->total_bytes)
2097                 return -EINVAL;
2098
2099         path = btrfs_alloc_path();
2100         if (!path)
2101                 return -ENOMEM;
2102
2103         path->reada = 2;
2104
2105         lock_chunks(root);
2106
2107         device->total_bytes = new_size;
2108         if (device->writeable)
2109                 device->fs_devices->total_rw_bytes -= diff;
2110         unlock_chunks(root);
2111
2112 again:
2113         key.objectid = device->devid;
2114         key.offset = (u64)-1;
2115         key.type = BTRFS_DEV_EXTENT_KEY;
2116
2117         while (1) {
2118                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2119                 if (ret < 0)
2120                         goto done;
2121
2122                 ret = btrfs_previous_item(root, path, 0, key.type);
2123                 if (ret < 0)
2124                         goto done;
2125                 if (ret) {
2126                         ret = 0;
2127                         btrfs_release_path(path);
2128                         break;
2129                 }
2130
2131                 l = path->nodes[0];
2132                 slot = path->slots[0];
2133                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2134
2135                 if (key.objectid != device->devid) {
2136                         btrfs_release_path(path);
2137                         break;
2138                 }
2139
2140                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2141                 length = btrfs_dev_extent_length(l, dev_extent);
2142
2143                 if (key.offset + length <= new_size) {
2144                         btrfs_release_path(path);
2145                         break;
2146                 }
2147
2148                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2149                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2150                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2151                 btrfs_release_path(path);
2152
2153                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2154                                            chunk_offset);
2155                 if (ret && ret != -ENOSPC)
2156                         goto done;
2157                 if (ret == -ENOSPC)
2158                         failed++;
2159                 key.offset -= 1;
2160         }
2161
2162         if (failed && !retried) {
2163                 failed = 0;
2164                 retried = true;
2165                 goto again;
2166         } else if (failed && retried) {
2167                 ret = -ENOSPC;
2168                 lock_chunks(root);
2169
2170                 device->total_bytes = old_size;
2171                 if (device->writeable)
2172                         device->fs_devices->total_rw_bytes += diff;
2173                 unlock_chunks(root);
2174                 goto done;
2175         }
2176
2177         /* Shrinking succeeded, else we would be at "done". */
2178         trans = btrfs_start_transaction(root, 0);
2179         if (IS_ERR(trans)) {
2180                 ret = PTR_ERR(trans);
2181                 goto done;
2182         }
2183
2184         lock_chunks(root);
2185
2186         device->disk_total_bytes = new_size;
2187         /* Now btrfs_update_device() will change the on-disk size. */
2188         ret = btrfs_update_device(trans, device);
2189         if (ret) {
2190                 unlock_chunks(root);
2191                 btrfs_end_transaction(trans, root);
2192                 goto done;
2193         }
2194         WARN_ON(diff > old_total);
2195         btrfs_set_super_total_bytes(super_copy, old_total - diff);
2196         unlock_chunks(root);
2197         btrfs_end_transaction(trans, root);
2198 done:
2199         btrfs_free_path(path);
2200         return ret;
2201 }
2202
2203 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2204                            struct btrfs_root *root,
2205                            struct btrfs_key *key,
2206                            struct btrfs_chunk *chunk, int item_size)
2207 {
2208         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2209         struct btrfs_disk_key disk_key;
2210         u32 array_size;
2211         u8 *ptr;
2212
2213         array_size = btrfs_super_sys_array_size(super_copy);
2214         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2215                 return -EFBIG;
2216
2217         ptr = super_copy->sys_chunk_array + array_size;
2218         btrfs_cpu_key_to_disk(&disk_key, key);
2219         memcpy(ptr, &disk_key, sizeof(disk_key));
2220         ptr += sizeof(disk_key);
2221         memcpy(ptr, chunk, item_size);
2222         item_size += sizeof(disk_key);
2223         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2224         return 0;
2225 }
2226
2227 /*
2228  * sort the devices in descending order by max_avail, total_avail
2229  */
2230 static int btrfs_cmp_device_info(const void *a, const void *b)
2231 {
2232         const struct btrfs_device_info *di_a = a;
2233         const struct btrfs_device_info *di_b = b;
2234
2235         if (di_a->max_avail > di_b->max_avail)
2236                 return -1;
2237         if (di_a->max_avail < di_b->max_avail)
2238                 return 1;
2239         if (di_a->total_avail > di_b->total_avail)
2240                 return -1;
2241         if (di_a->total_avail < di_b->total_avail)
2242                 return 1;
2243         return 0;
2244 }
2245
2246 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2247                                struct btrfs_root *extent_root,
2248                                struct map_lookup **map_ret,
2249                                u64 *num_bytes_out, u64 *stripe_size_out,
2250                                u64 start, u64 type)
2251 {
2252         struct btrfs_fs_info *info = extent_root->fs_info;
2253         struct btrfs_fs_devices *fs_devices = info->fs_devices;
2254         struct list_head *cur;
2255         struct map_lookup *map = NULL;
2256         struct extent_map_tree *em_tree;
2257         struct extent_map *em;
2258         struct btrfs_device_info *devices_info = NULL;
2259         u64 total_avail;
2260         int num_stripes;        /* total number of stripes to allocate */
2261         int sub_stripes;        /* sub_stripes info for map */
2262         int dev_stripes;        /* stripes per dev */
2263         int devs_max;           /* max devs to use */
2264         int devs_min;           /* min devs needed */
2265         int devs_increment;     /* ndevs has to be a multiple of this */
2266         int ncopies;            /* how many copies the data has */
2267         int ret;
2268         u64 max_stripe_size;
2269         u64 max_chunk_size;
2270         u64 stripe_size;
2271         u64 num_bytes;
2272         int ndevs;
2273         int i;
2274         int j;
2275
2276         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2277             (type & BTRFS_BLOCK_GROUP_DUP)) {
2278                 WARN_ON(1);
2279                 type &= ~BTRFS_BLOCK_GROUP_DUP;
2280         }
2281
2282         if (list_empty(&fs_devices->alloc_list))
2283                 return -ENOSPC;
2284
2285         sub_stripes = 1;
2286         dev_stripes = 1;
2287         devs_increment = 1;
2288         ncopies = 1;
2289         devs_max = 0;   /* 0 == as many as possible */
2290         devs_min = 1;
2291
2292         /*
2293          * define the properties of each RAID type.
2294          * FIXME: move this to a global table and use it in all RAID
2295          * calculation code
2296          */
2297         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2298                 dev_stripes = 2;
2299                 ncopies = 2;
2300                 devs_max = 1;
2301         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2302                 devs_min = 2;
2303         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2304                 devs_increment = 2;
2305                 ncopies = 2;
2306                 devs_max = 2;
2307                 devs_min = 2;
2308         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2309                 sub_stripes = 2;
2310                 devs_increment = 2;
2311                 ncopies = 2;
2312                 devs_min = 4;
2313         } else {
2314                 devs_max = 1;
2315         }
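        /*
         * Summary of the profile parameters chosen above:
         *
         *   profile  dev_stripes  devs_min  devs_max  devs_increment  ncopies  sub_stripes
         *   SINGLE        1           1         1           1            1         1
         *   DUP           2           1         1           1            2         1
         *   RAID0         1           2         0           1            1         1
         *   RAID1         1           2         2           2            2         1
         *   RAID10        1           4         0           2            2         2
         */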
2316
2317         if (type & BTRFS_BLOCK_GROUP_DATA) {
2318                 max_stripe_size = 1024 * 1024 * 1024;
2319                 max_chunk_size = 10 * max_stripe_size;
2320         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2321                 max_stripe_size = 256 * 1024 * 1024;
2322                 max_chunk_size = max_stripe_size;
2323         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2324                 max_stripe_size = 8 * 1024 * 1024;
2325                 max_chunk_size = 2 * max_stripe_size;
2326         } else {
2327                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2328                        type);
2329                 BUG_ON(1);
2330         }
2331
2332         /* we don't want a chunk larger than 10% of writeable space */
2333         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2334                              max_chunk_size);
2335
2336         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2337                                GFP_NOFS);
2338         if (!devices_info)
2339                 return -ENOMEM;
2340
2341         cur = fs_devices->alloc_list.next;
2342
2343         /*
2344          * in the first pass through the devices list, we gather information
2345          * about the available holes on each device.
2346          */
2347         ndevs = 0;
2348         while (cur != &fs_devices->alloc_list) {
2349                 struct btrfs_device *device;
2350                 u64 max_avail;
2351                 u64 dev_offset;
2352
2353                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2354
2355                 cur = cur->next;
2356
2357                 if (!device->writeable) {
2358                         printk(KERN_ERR
2359                                "btrfs: read-only device in alloc_list\n");
2360                         WARN_ON(1);
2361                         continue;
2362                 }
2363
2364                 if (!device->in_fs_metadata)
2365                         continue;
2366
2367                 if (device->total_bytes > device->bytes_used)
2368                         total_avail = device->total_bytes - device->bytes_used;
2369                 else
2370                         total_avail = 0;
2371                 /* avail is off by max(alloc_start, 1MB), but that is the same
2372                  * for all devices, so it doesn't hurt the sorting later on
2373                  */
2374
2375                 ret = find_free_dev_extent(trans, device,
2376                                            max_stripe_size * dev_stripes,
2377                                            &dev_offset, &max_avail);
2378                 if (ret && ret != -ENOSPC)
2379                         goto error;
2380
2381                 if (ret == 0)
2382                         max_avail = max_stripe_size * dev_stripes;
2383
2384                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2385                         continue;
2386
2387                 devices_info[ndevs].dev_offset = dev_offset;
2388                 devices_info[ndevs].max_avail = max_avail;
2389                 devices_info[ndevs].total_avail = total_avail;
2390                 devices_info[ndevs].dev = device;
2391                 ++ndevs;
2392         }
2393
2394         /*
2395          * now sort the devices by hole size / available space
2396          */
2397         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2398              btrfs_cmp_device_info, NULL);
2399
2400         /* round down to number of usable stripes */
2401         ndevs -= ndevs % devs_increment;
2402
2403         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2404                 ret = -ENOSPC;
2405                 goto error;
2406         }
2407
2408         if (devs_max && ndevs > devs_max)
2409                 ndevs = devs_max;
2410         /*
2411          * the primary goal is to maximize the number of stripes, so use as many
2412          * devices as possible, even if the stripes are not maximum sized.
2413          */
2414         stripe_size = devices_info[ndevs-1].max_avail;
2415         num_stripes = ndevs * dev_stripes;
2416
2417         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2418                 stripe_size = max_chunk_size * ncopies;
2419                 do_div(stripe_size, num_stripes);
2420         }
2421
2422         do_div(stripe_size, dev_stripes);
2423         do_div(stripe_size, BTRFS_STRIPE_LEN);
2424         stripe_size *= BTRFS_STRIPE_LEN;
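        /*
         * The do_div/multiply pair above rounds stripe_size down to a
         * multiple of BTRFS_STRIPE_LEN.  For example (hypothetical
         * numbers), with a 64KiB stripe length a raw stripe_size of
         * 1000000 bytes becomes 15 * 65536 == 983040 bytes.
         */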
2425
2426         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2427         if (!map) {
2428                 ret = -ENOMEM;
2429                 goto error;
2430         }
2431         map->num_stripes = num_stripes;
2432
2433         for (i = 0; i < ndevs; ++i) {
2434                 for (j = 0; j < dev_stripes; ++j) {
2435                         int s = i * dev_stripes + j;
2436                         map->stripes[s].dev = devices_info[i].dev;
2437                         map->stripes[s].physical = devices_info[i].dev_offset +
2438                                                    j * stripe_size;
2439                 }
2440         }
2441         map->sector_size = extent_root->sectorsize;
2442         map->stripe_len = BTRFS_STRIPE_LEN;
2443         map->io_align = BTRFS_STRIPE_LEN;
2444         map->io_width = BTRFS_STRIPE_LEN;
2445         map->type = type;
2446         map->sub_stripes = sub_stripes;
2447
2448         *map_ret = map;
2449         num_bytes = stripe_size * (num_stripes / ncopies);
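        /*
         * num_bytes is the logical size of the chunk: for example, a
         * RAID1 chunk on two devices has num_stripes == 2 and
         * ncopies == 2, so it presents stripe_size bytes of logical
         * space, while a RAID0 chunk with num_stripes == 2 and
         * ncopies == 1 presents 2 * stripe_size.
         */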
2450
2451         *stripe_size_out = stripe_size;
2452         *num_bytes_out = num_bytes;
2453
2454         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2455
2456         em = alloc_extent_map();
2457         if (!em) {
2458                 ret = -ENOMEM;
2459                 goto error;
2460         }
2461         em->bdev = (struct block_device *)map;
2462         em->start = start;
2463         em->len = num_bytes;
2464         em->block_start = 0;
2465         em->block_len = em->len;
2466
2467         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2468         write_lock(&em_tree->lock);
2469         ret = add_extent_mapping(em_tree, em);
2470         write_unlock(&em_tree->lock);
2471         BUG_ON(ret);
2472         free_extent_map(em);
2473
2474         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2475                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2476                                      start, num_bytes);
2477         BUG_ON(ret);
2478
2479         for (i = 0; i < map->num_stripes; ++i) {
2480                 struct btrfs_device *device;
2481                 u64 dev_offset;
2482
2483                 device = map->stripes[i].dev;
2484                 dev_offset = map->stripes[i].physical;
2485
2486                 ret = btrfs_alloc_dev_extent(trans, device,
2487                                 info->chunk_root->root_key.objectid,
2488                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2489                                 start, dev_offset, stripe_size);
2490                 BUG_ON(ret);
2491         }
2492
2493         kfree(devices_info);
2494         return 0;
2495
2496 error:
2497         kfree(map);
2498         kfree(devices_info);
2499         return ret;
2500 }
2501
2502 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2503                                 struct btrfs_root *extent_root,
2504                                 struct map_lookup *map, u64 chunk_offset,
2505                                 u64 chunk_size, u64 stripe_size)
2506 {
2507         u64 dev_offset;
2508         struct btrfs_key key;
2509         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2510         struct btrfs_device *device;
2511         struct btrfs_chunk *chunk;
2512         struct btrfs_stripe *stripe;
2513         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2514         int index = 0;
2515         int ret;
2516
2517         chunk = kzalloc(item_size, GFP_NOFS);
2518         if (!chunk)
2519                 return -ENOMEM;
2520
2521         index = 0;
2522         while (index < map->num_stripes) {
2523                 device = map->stripes[index].dev;
2524                 device->bytes_used += stripe_size;
2525                 ret = btrfs_update_device(trans, device);
2526                 BUG_ON(ret);
2527                 index++;
2528         }
2529
2530         index = 0;
2531         stripe = &chunk->stripe;
2532         while (index < map->num_stripes) {
2533                 device = map->stripes[index].dev;
2534                 dev_offset = map->stripes[index].physical;
2535
2536                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2537                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2538                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2539                 stripe++;
2540                 index++;
2541         }
2542
2543         btrfs_set_stack_chunk_length(chunk, chunk_size);
2544         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2545         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2546         btrfs_set_stack_chunk_type(chunk, map->type);
2547         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2548         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2549         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2550         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2551         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2552
2553         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2554         key.type = BTRFS_CHUNK_ITEM_KEY;
2555         key.offset = chunk_offset;
2556
2557         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2558         BUG_ON(ret);
2559
2560         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2561                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2562                                              item_size);
2563                 BUG_ON(ret);
2564         }
2565
2566         kfree(chunk);
2567         return 0;
2568 }
2569
2570 /*
2571  * Chunk allocation falls into two parts. The first part does the
2572  * work that makes the newly allocated chunk usable, but does not do
2573  * any operation that modifies the chunk tree. The second part does
2574  * the work that requires modifying the chunk tree. This division is
2575  * important for the bootstrap process of adding storage to a seed btrfs.
2576  */
2577 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2578                       struct btrfs_root *extent_root, u64 type)
2579 {
2580         u64 chunk_offset;
2581         u64 chunk_size;
2582         u64 stripe_size;
2583         struct map_lookup *map;
2584         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2585         int ret;
2586
2587         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2588                               &chunk_offset);
2589         if (ret)
2590                 return ret;
2591
2592         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2593                                   &stripe_size, chunk_offset, type);
2594         if (ret)
2595                 return ret;
2596
2597         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2598                                    chunk_size, stripe_size);
2599         BUG_ON(ret);
2600         return 0;
2601 }
2602
2603 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2604                                          struct btrfs_root *root,
2605                                          struct btrfs_device *device)
2606 {
2607         u64 chunk_offset;
2608         u64 sys_chunk_offset;
2609         u64 chunk_size;
2610         u64 sys_chunk_size;
2611         u64 stripe_size;
2612         u64 sys_stripe_size;
2613         u64 alloc_profile;
2614         struct map_lookup *map;
2615         struct map_lookup *sys_map;
2616         struct btrfs_fs_info *fs_info = root->fs_info;
2617         struct btrfs_root *extent_root = fs_info->extent_root;
2618         int ret;
2619
2620         ret = find_next_chunk(fs_info->chunk_root,
2621                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2622         BUG_ON(ret);
2623
2624         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2625                         (fs_info->metadata_alloc_profile &
2626                          fs_info->avail_metadata_alloc_bits);
2627         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2628
2629         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2630                                   &stripe_size, chunk_offset, alloc_profile);
2631         BUG_ON(ret);
2632
2633         sys_chunk_offset = chunk_offset + chunk_size;
2634
2635         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2636                         (fs_info->system_alloc_profile &
2637                          fs_info->avail_system_alloc_bits);
2638         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2639
2640         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2641                                   &sys_chunk_size, &sys_stripe_size,
2642                                   sys_chunk_offset, alloc_profile);
2643         BUG_ON(ret);
2644
2645         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2646         BUG_ON(ret);
2647
2648         /*
2649          * Modifying the chunk tree requires allocating new blocks from
2650          * both the system block group and the metadata block group. So
2651          * we can only do operations that modify the chunk tree after
2652          * both block groups have been created.
2653          */
2654         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2655                                    chunk_size, stripe_size);
2656         BUG_ON(ret);
2657
2658         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2659                                    sys_chunk_offset, sys_chunk_size,
2660                                    sys_stripe_size);
2661         BUG_ON(ret);
2662         return 0;
2663 }
2664
2665 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2666 {
2667         struct extent_map *em;
2668         struct map_lookup *map;
2669         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2670         int readonly = 0;
2671         int i;
2672
2673         read_lock(&map_tree->map_tree.lock);
2674         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2675         read_unlock(&map_tree->map_tree.lock);
2676         if (!em)
2677                 return 1;
2678
2679         if (btrfs_test_opt(root, DEGRADED)) {
2680                 free_extent_map(em);
2681                 return 0;
2682         }
2683
2684         map = (struct map_lookup *)em->bdev;
2685         for (i = 0; i < map->num_stripes; i++) {
2686                 if (!map->stripes[i].dev->writeable) {
2687                         readonly = 1;
2688                         break;
2689                 }
2690         }
2691         free_extent_map(em);
2692         return readonly;
2693 }
2694
2695 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2696 {
2697         extent_map_tree_init(&tree->map_tree);
2698 }
2699
2700 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2701 {
2702         struct extent_map *em;
2703
2704         while (1) {
2705                 write_lock(&tree->map_tree.lock);
2706                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2707                 if (em)
2708                         remove_extent_mapping(&tree->map_tree, em);
2709                 write_unlock(&tree->map_tree.lock);
2710                 if (!em)
2711                         break;
2712                 kfree(em->bdev);
2713                 /* once for us */
2714                 free_extent_map(em);
2715                 /* once for the tree */
2716                 free_extent_map(em);
2717         }
2718 }
2719
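/*
 * Number of redundant copies of the block at @logical: num_stripes for
 * DUP/RAID1, sub_stripes for RAID10, otherwise 1.
 */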
2720 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2721 {
2722         struct extent_map *em;
2723         struct map_lookup *map;
2724         struct extent_map_tree *em_tree = &map_tree->map_tree;
2725         int ret;
2726
2727         read_lock(&em_tree->lock);
2728         em = lookup_extent_mapping(em_tree, logical, len);
2729         read_unlock(&em_tree->lock);
2730         BUG_ON(!em);
2731
2732         BUG_ON(em->start > logical || em->start + em->len < logical);
2733         map = (struct map_lookup *)em->bdev;
2734         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2735                 ret = map->num_stripes;
2736         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2737                 ret = map->sub_stripes;
2738         else
2739                 ret = 1;
2740         free_extent_map(em);
2741         return ret;
2742 }
2743
2744 static int find_live_mirror(struct map_lookup *map, int first, int num,
2745                             int optimal)
2746 {
2747         int i;
2748         if (map->stripes[optimal].dev->bdev)
2749                 return optimal;
2750         for (i = first; i < first + num; i++) {
2751                 if (map->stripes[i].dev->bdev)
2752                         return i;
2753         }
2754         /* we couldn't find one that doesn't fail.  Just return something
2755          * and the io error handling code will clean up eventually
2756          */
2757         return optimal;
2758 }
2759
2760 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2761                              u64 logical, u64 *length,
2762                              struct btrfs_multi_bio **multi_ret,
2763                              int mirror_num)
2764 {
2765         struct extent_map *em;
2766         struct map_lookup *map;
2767         struct extent_map_tree *em_tree = &map_tree->map_tree;
2768         u64 offset;
2769         u64 stripe_offset;
2770         u64 stripe_end_offset;
2771         u64 stripe_nr;
2772         u64 stripe_nr_orig;
2773         u64 stripe_nr_end;
2774         int stripes_allocated = 8;
2775         int stripes_required = 1;
2776         int stripe_index;
2777         int i;
2778         int num_stripes;
2779         int max_errors = 0;
2780         struct btrfs_multi_bio *multi = NULL;
2781
2782         if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2783                 stripes_allocated = 1;
2784 again:
2785         if (multi_ret) {
2786                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2787                                 GFP_NOFS);
2788                 if (!multi)
2789                         return -ENOMEM;
2790
2791                 atomic_set(&multi->error, 0);
2792         }
2793
2794         read_lock(&em_tree->lock);
2795         em = lookup_extent_mapping(em_tree, logical, *length);
2796         read_unlock(&em_tree->lock);
2797
2798         if (!em) {
2799                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2800                        (unsigned long long)logical,
2801                        (unsigned long long)*length);
2802                 BUG();
2803         }
2804
2805         BUG_ON(em->start > logical || em->start + em->len < logical);
2806         map = (struct map_lookup *)em->bdev;
2807         offset = logical - em->start;
2808
2809         if (mirror_num > map->num_stripes)
2810                 mirror_num = 0;
2811
2812         /* if our multi bio struct is too small, back off and try again */
2813         if (rw & REQ_WRITE) {
2814                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2815                                  BTRFS_BLOCK_GROUP_DUP)) {
2816                         stripes_required = map->num_stripes;
2817                         max_errors = 1;
2818                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2819                         stripes_required = map->sub_stripes;
2820                         max_errors = 1;
2821                 }
2822         }
2823         if (rw & REQ_DISCARD) {
2824                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2825                                  BTRFS_BLOCK_GROUP_RAID1 |
2826                                  BTRFS_BLOCK_GROUP_DUP |
2827                                  BTRFS_BLOCK_GROUP_RAID10)) {
2828                         stripes_required = map->num_stripes;
2829                 }
2830         }
2831         if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2832             stripes_allocated < stripes_required) {
2833                 stripes_allocated = map->num_stripes;
2834                 free_extent_map(em);
2835                 kfree(multi);
2836                 goto again;
2837         }
2838         stripe_nr = offset;
2839         /*
2840          * stripe_nr counts the total number of stripes we have to stride
2841          * to get to this block
2842          */
2843         do_div(stripe_nr, map->stripe_len);
2844
2845         stripe_offset = stripe_nr * map->stripe_len;
2846         BUG_ON(offset < stripe_offset);
2847
2848         /* stripe_offset is the offset of this block in its stripe */
2849         stripe_offset = offset - stripe_offset;
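        /*
         * Worked example with hypothetical numbers: a stripe_len of
         * 64KiB and an offset of 300KiB give stripe_nr == 4 and
         * stripe_offset == 300KiB - 4 * 64KiB == 44KiB into that stripe.
         */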
2850
2851         if (rw & REQ_DISCARD)
2852                 *length = min_t(u64, em->len - offset, *length);
2853         else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2854                               BTRFS_BLOCK_GROUP_RAID1 |
2855                               BTRFS_BLOCK_GROUP_RAID10 |
2856                               BTRFS_BLOCK_GROUP_DUP)) {
2857                 /* we limit the length of each bio to what fits in a stripe */
2858                 *length = min_t(u64, em->len - offset,
2859                                 map->stripe_len - stripe_offset);
2860         } else {
2861                 *length = em->len - offset;
2862         }
2863
2864         if (!multi_ret)
2865                 goto out;
2866
2867         num_stripes = 1;
2868         stripe_index = 0;
2869         stripe_nr_orig = stripe_nr;
2870         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2871                         (~(map->stripe_len - 1));
2872         do_div(stripe_nr_end, map->stripe_len);
2873         stripe_end_offset = stripe_nr_end * map->stripe_len -
2874                             (offset + *length);
2875         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2876                 if (rw & REQ_DISCARD)
2877                         num_stripes = min_t(u64, map->num_stripes,
2878                                             stripe_nr_end - stripe_nr_orig);
2879                 stripe_index = do_div(stripe_nr, map->num_stripes);
2880         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2881                 if (rw & (REQ_WRITE | REQ_DISCARD))
2882                         num_stripes = map->num_stripes;
2883                 else if (mirror_num)
2884                         stripe_index = mirror_num - 1;
2885                 else {
2886                         stripe_index = find_live_mirror(map, 0,
2887                                             map->num_stripes,
2888                                             current->pid % map->num_stripes);
2889                 }
2890
2891         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2892                 if (rw & (REQ_WRITE | REQ_DISCARD))
2893                         num_stripes = map->num_stripes;
2894                 else if (mirror_num)
2895                         stripe_index = mirror_num - 1;
2896
2897         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2898                 int factor = map->num_stripes / map->sub_stripes;
2899
2900                 stripe_index = do_div(stripe_nr, factor);
2901                 stripe_index *= map->sub_stripes;
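                /*
                 * E.g. a 4-device RAID10 chunk has factor == 2: even
                 * logical stripes land on the mirror pair at
                 * stripe_index 0, odd ones on the pair at stripe_index
                 * 2; a write goes to both members of the chosen pair.
                 */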
2902
2903                 if (rw & REQ_WRITE)
2904                         num_stripes = map->sub_stripes;
2905                 else if (rw & REQ_DISCARD)
2906                         num_stripes = min_t(u64, map->sub_stripes *
2907                                             (stripe_nr_end - stripe_nr_orig),
2908                                             map->num_stripes);
2909                 else if (mirror_num)
2910                         stripe_index += mirror_num - 1;
2911                 else {
2912                         stripe_index = find_live_mirror(map, stripe_index,
2913                                               map->sub_stripes, stripe_index +
2914                                               current->pid % map->sub_stripes);
2915                 }
2916         } else {
2917                 /*
2918                  * after this do_div call, stripe_nr is the number of stripes
2919                  * on this device we have to walk to find the data, and
2920                  * stripe_index is the number of our device in the stripe array
2921                  */
2922                 stripe_index = do_div(stripe_nr, map->num_stripes);
2923         }
2924         BUG_ON(stripe_index >= map->num_stripes);
2925
2926         if (rw & REQ_DISCARD) {
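                /*
                 * hand each stripe its share of the discard: count how
                 * many stripe_len units land on it, then trim the first
                 * and last stripes by the partial head and tail
                 */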
                for (i = 0; i < num_stripes; i++) {
                        multi->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset + stripe_nr * map->stripe_len;
                        multi->stripes[i].dev = map->stripes[stripe_index].dev;

                        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
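                                /*
                                 * last_stripe is the index of the stripe
                                 * holding the final stripe_len unit of
                                 * the discard range
                                 */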
                                u64 stripes;
                                u32 last_stripe = 0;
                                int j;

                                div_u64_rem(stripe_nr_end - 1,
                                            map->num_stripes,
                                            &last_stripe);

                                for (j = 0; j < map->num_stripes; j++) {
                                        u32 test;

                                        div_u64_rem(stripe_nr_end - 1 - j,
                                                    map->num_stripes, &test);
                                        if (test == stripe_index)
                                                break;
                                }
                                stripes = stripe_nr_end - 1 - j;
                                do_div(stripes, map->num_stripes);
                                multi->stripes[i].length = map->stripe_len *
                                        (stripes - stripe_nr + 1);

                                if (i == 0) {
                                        multi->stripes[i].length -=
                                                stripe_offset;
                                        stripe_offset = 0;
                                }
                                if (stripe_index == last_stripe)
                                        multi->stripes[i].length -=
                                                stripe_end_offset;
                        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                                u64 stripes;
                                int j;
                                int factor = map->num_stripes /
                                             map->sub_stripes;
                                u32 last_stripe = 0;

                                div_u64_rem(stripe_nr_end - 1,
                                            factor, &last_stripe);
                                last_stripe *= map->sub_stripes;

                                for (j = 0; j < factor; j++) {
                                        u32 test;

                                        div_u64_rem(stripe_nr_end - 1 - j,
                                                    factor, &test);

                                        if (test ==
                                            stripe_index / map->sub_stripes)
                                                break;
                                }
                                stripes = stripe_nr_end - 1 - j;
                                do_div(stripes, factor);
                                multi->stripes[i].length = map->stripe_len *
                                        (stripes - stripe_nr + 1);

                                if (i < map->sub_stripes) {
                                        multi->stripes[i].length -=
                                                stripe_offset;
                                        if (i == map->sub_stripes - 1)
                                                stripe_offset = 0;
                                }
                                if (stripe_index >= last_stripe &&
                                    stripe_index <= (last_stripe +
                                                     map->sub_stripes - 1)) {
                                        multi->stripes[i].length -=
                                                stripe_end_offset;
                                }
                        } else
                                multi->stripes[i].length = *length;

                        stripe_index++;
                        if (stripe_index == map->num_stripes) {
                                /* This could only happen for RAID0/10 */
                                stripe_index = 0;
                                stripe_nr++;
                        }
                }
        } else {
                for (i = 0; i < num_stripes; i++) {
                        multi->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset +
                                stripe_nr * map->stripe_len;
                        multi->stripes[i].dev =
                                map->stripes[stripe_index].dev;
                        stripe_index++;
                }
        }
        if (multi_ret) {
                *multi_ret = multi;
                multi->num_stripes = num_stripes;
                multi->max_errors = max_errors;
        }
out:
        free_extent_map(em);
        return 0;
}

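/* thin non-static wrapper around __btrfs_map_block for outside callers */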
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                      u64 logical, u64 *length,
                      struct btrfs_multi_bio **multi_ret, int mirror_num)
{
        return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
                                 mirror_num);
}

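/*
 * reverse map: given a physical byte offset on a device inside the chunk
 * at chunk_start, collect the unique logical addresses that map to it.
 * Mirrored stripes can make several stripes resolve to the same logical
 * address, hence the dedup pass over buf below.
 */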
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len)
{
        struct extent_map_tree *em_tree = &map_tree->map_tree;
        struct extent_map *em;
        struct map_lookup *map;
        u64 *buf;
        u64 bytenr;
        u64 length;
        u64 stripe_nr;
        int i, j, nr = 0;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_start, 1);
        read_unlock(&em_tree->lock);

        BUG_ON(!em || em->start != chunk_start);
        map = (struct map_lookup *)em->bdev;

        length = em->len;
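        /*
         * when the chunk is striped, each device only holds a fraction
         * of the chunk's logical length
         */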
        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
                do_div(length, map->num_stripes / map->sub_stripes);
        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                do_div(length, map->num_stripes);

        buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
        BUG_ON(!buf);

        for (i = 0; i < map->num_stripes; i++) {
                if (devid && map->stripes[i].dev->devid != devid)
                        continue;
                if (map->stripes[i].physical > physical ||
                    map->stripes[i].physical + length <= physical)
                        continue;

                stripe_nr = physical - map->stripes[i].physical;
                do_div(stripe_nr, map->stripe_len);

                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                        stripe_nr = stripe_nr * map->num_stripes + i;
                        do_div(stripe_nr, map->sub_stripes);
                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                        stripe_nr = stripe_nr * map->num_stripes + i;
                }
                bytenr = chunk_start + stripe_nr * map->stripe_len;
                WARN_ON(nr >= map->num_stripes);
                for (j = 0; j < nr; j++) {
                        if (buf[j] == bytenr)
                                break;
                }
                if (j == nr) {
                        WARN_ON(nr >= map->num_stripes);
                        buf[nr++] = bytenr;
                }
        }

        *logical = buf;
        *naddrs = nr;
        *stripe_len = map->stripe_len;

        free_extent_map(em);
        return 0;
}

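/*
 * completion handler shared by all stripes of a multi-bio: count any
 * error, and once the last stripe finishes, end the original bio.  The
 * error only propagates if more stripes failed than max_errors allows.
 */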
static void end_bio_multi_stripe(struct bio *bio, int err)
{
        struct btrfs_multi_bio *multi = bio->bi_private;
        int is_orig_bio = 0;

        if (err)
                atomic_inc(&multi->error);

        if (bio == multi->orig_bio)
                is_orig_bio = 1;

        if (atomic_dec_and_test(&multi->stripes_pending)) {
                if (!is_orig_bio) {
                        bio_put(bio);
                        bio = multi->orig_bio;
                }
                bio->bi_private = multi->private;
                bio->bi_end_io = multi->end_io;
                /* only send an error to the higher layers if it is
                 * beyond the tolerance of the multi-bio
                 */
                if (atomic_read(&multi->error) > multi->max_errors) {
                        err = -EIO;
                } else if (err) {
                        /*
                         * this bio is actually up to date, we didn't
                         * go over the max number of errors
                         */
                        set_bit(BIO_UPTODATE, &bio->bi_flags);
                        err = 0;
                }
                kfree(multi);

                bio_endio(bio, err);
        } else if (!is_orig_bio) {
                bio_put(bio);
        }
}

struct async_sched {
        struct bio *bio;
        int rw;
        struct btrfs_fs_info *info;
        struct btrfs_work work;
};

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline int schedule_bio(struct btrfs_root *root,
                                 struct btrfs_device *device,
                                 int rw, struct bio *bio)
{
        int should_queue = 1;
        struct btrfs_pending_bios *pending_bios;

        /* don't bother with additional async steps for reads, right now */
        if (!(rw & REQ_WRITE)) {
                bio_get(bio);
                submit_bio(rw, bio);
                bio_put(bio);
                return 0;
        }

        /*
         * nr_async_bios allows us to reliably return congestion to the
         * higher layers.  Otherwise, the async bio makes it appear we have
         * made progress against dirty pages when we've really just put it
         * on a queue for later
         */
        atomic_inc(&root->fs_info->nr_async_bios);
        WARN_ON(bio->bi_next);
        bio->bi_next = NULL;
        bio->bi_rw |= rw;

        spin_lock(&device->io_lock);
        if (bio->bi_rw & REQ_SYNC)
                pending_bios = &device->pending_sync_bios;
        else
                pending_bios = &device->pending_bios;

        if (pending_bios->tail)
                pending_bios->tail->bi_next = bio;

        pending_bios->tail = bio;
        if (!pending_bios->head)
                pending_bios->head = bio;
        if (device->running_pending)
                should_queue = 0;

        spin_unlock(&device->io_lock);

        if (should_queue)
                btrfs_queue_worker(&root->fs_info->submit_workers,
                                   &device->work);
        return 0;
}

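/*
 * map a logical bio to its physical stripes and submit a copy to each
 * device, deferring writes through schedule_bio() when async_submit is
 * set
 */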
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                  int mirror_num, int async_submit)
{
        struct btrfs_mapping_tree *map_tree;
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
        u64 logical = (u64)bio->bi_sector << 9;