btrfs: move the space_info handling code to space-info.c
[sfrench/cifs-2.6.git] / fs / btrfs / extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32
33 #undef SCRAMBLE_DELAYED_REFS
34
35 /*
36  * Declare a helper function to detect underflow of various space info members
37  */
38 #define DECLARE_SPACE_INFO_UPDATE(name)                                 \
39 static inline void update_##name(struct btrfs_fs_info *fs_info,         \
40                                  struct btrfs_space_info *sinfo,        \
41                                  s64 bytes)                             \
42 {                                                                       \
43         lockdep_assert_held(&sinfo->lock);                              \
44         trace_update_##name(fs_info, sinfo, sinfo->name, bytes);        \
45         if (bytes < 0 && sinfo->name < -bytes) {                        \
46                 WARN_ON(1);                                             \
47                 sinfo->name = 0;                                        \
48                 return;                                                 \
49         }                                                               \
50         sinfo->name += bytes;                                           \
51 }
52
53 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
54 DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
55
56 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
57                                struct btrfs_delayed_ref_node *node, u64 parent,
58                                u64 root_objectid, u64 owner_objectid,
59                                u64 owner_offset, int refs_to_drop,
60                                struct btrfs_delayed_extent_op *extra_op);
61 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
62                                     struct extent_buffer *leaf,
63                                     struct btrfs_extent_item *ei);
64 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
65                                       u64 parent, u64 root_objectid,
66                                       u64 flags, u64 owner, u64 offset,
67                                       struct btrfs_key *ins, int ref_mod);
68 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
69                                      struct btrfs_delayed_ref_node *node,
70                                      struct btrfs_delayed_extent_op *extent_op);
71 static int find_next_key(struct btrfs_path *path, int level,
72                          struct btrfs_key *key);
73 static void dump_space_info(struct btrfs_fs_info *fs_info,
74                             struct btrfs_space_info *info, u64 bytes,
75                             int dump_block_groups);
76 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
77                                u64 num_bytes);
78
79 static noinline int
80 block_group_cache_done(struct btrfs_block_group_cache *cache)
81 {
82         smp_mb();
83         return cache->cached == BTRFS_CACHE_FINISHED ||
84                 cache->cached == BTRFS_CACHE_ERROR;
85 }
86
87 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
88 {
89         return (cache->flags & bits) == bits;
90 }
91
92 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
93 {
94         atomic_inc(&cache->count);
95 }
96
97 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
98 {
99         if (atomic_dec_and_test(&cache->count)) {
100                 WARN_ON(cache->pinned > 0);
101                 WARN_ON(cache->reserved > 0);
102
103                 /*
104                  * If not empty, someone is still holding mutex of
105                  * full_stripe_lock, which can only be released by caller.
106                  * And it will definitely cause use-after-free when caller
107                  * tries to release full stripe lock.
108                  *
109                  * No better way to resolve, but only to warn.
110                  */
111                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
112                 kfree(cache->free_space_ctl);
113                 kfree(cache);
114         }
115 }
116
117 /*
118  * this adds the block group to the fs_info rb tree for the block group
119  * cache
120  */
121 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
122                                 struct btrfs_block_group_cache *block_group)
123 {
124         struct rb_node **p;
125         struct rb_node *parent = NULL;
126         struct btrfs_block_group_cache *cache;
127
128         spin_lock(&info->block_group_cache_lock);
129         p = &info->block_group_cache_tree.rb_node;
130
131         while (*p) {
132                 parent = *p;
133                 cache = rb_entry(parent, struct btrfs_block_group_cache,
134                                  cache_node);
135                 if (block_group->key.objectid < cache->key.objectid) {
136                         p = &(*p)->rb_left;
137                 } else if (block_group->key.objectid > cache->key.objectid) {
138                         p = &(*p)->rb_right;
139                 } else {
140                         spin_unlock(&info->block_group_cache_lock);
141                         return -EEXIST;
142                 }
143         }
144
145         rb_link_node(&block_group->cache_node, parent, p);
146         rb_insert_color(&block_group->cache_node,
147                         &info->block_group_cache_tree);
148
149         if (info->first_logical_byte > block_group->key.objectid)
150                 info->first_logical_byte = block_group->key.objectid;
151
152         spin_unlock(&info->block_group_cache_lock);
153
154         return 0;
155 }
156
157 /*
158  * This will return the block group at or after bytenr if contains is 0, else
159  * it will return the block group that contains the bytenr
160  */
161 static struct btrfs_block_group_cache *
162 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
163                               int contains)
164 {
165         struct btrfs_block_group_cache *cache, *ret = NULL;
166         struct rb_node *n;
167         u64 end, start;
168
169         spin_lock(&info->block_group_cache_lock);
170         n = info->block_group_cache_tree.rb_node;
171
172         while (n) {
173                 cache = rb_entry(n, struct btrfs_block_group_cache,
174                                  cache_node);
175                 end = cache->key.objectid + cache->key.offset - 1;
176                 start = cache->key.objectid;
177
178                 if (bytenr < start) {
179                         if (!contains && (!ret || start < ret->key.objectid))
180                                 ret = cache;
181                         n = n->rb_left;
182                 } else if (bytenr > start) {
183                         if (contains && bytenr <= end) {
184                                 ret = cache;
185                                 break;
186                         }
187                         n = n->rb_right;
188                 } else {
189                         ret = cache;
190                         break;
191                 }
192         }
193         if (ret) {
194                 btrfs_get_block_group(ret);
195                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
196                         info->first_logical_byte = ret->key.objectid;
197         }
198         spin_unlock(&info->block_group_cache_lock);
199
200         return ret;
201 }
202
203 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
204                                u64 start, u64 num_bytes)
205 {
206         u64 end = start + num_bytes - 1;
207         set_extent_bits(&fs_info->freed_extents[0],
208                         start, end, EXTENT_UPTODATE);
209         set_extent_bits(&fs_info->freed_extents[1],
210                         start, end, EXTENT_UPTODATE);
211         return 0;
212 }
213
214 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
215 {
216         struct btrfs_fs_info *fs_info = cache->fs_info;
217         u64 start, end;
218
219         start = cache->key.objectid;
220         end = start + cache->key.offset - 1;
221
222         clear_extent_bits(&fs_info->freed_extents[0],
223                           start, end, EXTENT_UPTODATE);
224         clear_extent_bits(&fs_info->freed_extents[1],
225                           start, end, EXTENT_UPTODATE);
226 }
227
228 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
229 {
230         struct btrfs_fs_info *fs_info = cache->fs_info;
231         u64 bytenr;
232         u64 *logical;
233         int stripe_len;
234         int i, nr, ret;
235
236         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
237                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
238                 cache->bytes_super += stripe_len;
239                 ret = add_excluded_extent(fs_info, cache->key.objectid,
240                                           stripe_len);
241                 if (ret)
242                         return ret;
243         }
244
245         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
246                 bytenr = btrfs_sb_offset(i);
247                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
248                                        bytenr, &logical, &nr, &stripe_len);
249                 if (ret)
250                         return ret;
251
252                 while (nr--) {
253                         u64 start, len;
254
255                         if (logical[nr] > cache->key.objectid +
256                             cache->key.offset)
257                                 continue;
258
259                         if (logical[nr] + stripe_len <= cache->key.objectid)
260                                 continue;
261
262                         start = logical[nr];
263                         if (start < cache->key.objectid) {
264                                 start = cache->key.objectid;
265                                 len = (logical[nr] + stripe_len) - start;
266                         } else {
267                                 len = min_t(u64, stripe_len,
268                                             cache->key.objectid +
269                                             cache->key.offset - start);
270                         }
271
272                         cache->bytes_super += len;
273                         ret = add_excluded_extent(fs_info, start, len);
274                         if (ret) {
275                                 kfree(logical);
276                                 return ret;
277                         }
278                 }
279
280                 kfree(logical);
281         }
282         return 0;
283 }
284
285 static struct btrfs_caching_control *
286 get_caching_control(struct btrfs_block_group_cache *cache)
287 {
288         struct btrfs_caching_control *ctl;
289
290         spin_lock(&cache->lock);
291         if (!cache->caching_ctl) {
292                 spin_unlock(&cache->lock);
293                 return NULL;
294         }
295
296         ctl = cache->caching_ctl;
297         refcount_inc(&ctl->count);
298         spin_unlock(&cache->lock);
299         return ctl;
300 }
301
302 static void put_caching_control(struct btrfs_caching_control *ctl)
303 {
304         if (refcount_dec_and_test(&ctl->count))
305                 kfree(ctl);
306 }
307
308 #ifdef CONFIG_BTRFS_DEBUG
309 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
310 {
311         struct btrfs_fs_info *fs_info = block_group->fs_info;
312         u64 start = block_group->key.objectid;
313         u64 len = block_group->key.offset;
314         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
315                 fs_info->nodesize : fs_info->sectorsize;
316         u64 step = chunk << 1;
317
318         while (len > chunk) {
319                 btrfs_remove_free_space(block_group, start, chunk);
320                 start += step;
321                 if (len < step)
322                         len = 0;
323                 else
324                         len -= step;
325         }
326 }
327 #endif
328
329 /*
330  * this is only called by cache_block_group, since we could have freed extents
331  * we need to check the pinned_extents for any extents that can't be used yet
332  * since their free space will be released as soon as the transaction commits.
333  */
334 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
335                        u64 start, u64 end)
336 {
337         struct btrfs_fs_info *info = block_group->fs_info;
338         u64 extent_start, extent_end, size, total_added = 0;
339         int ret;
340
341         while (start < end) {
342                 ret = find_first_extent_bit(info->pinned_extents, start,
343                                             &extent_start, &extent_end,
344                                             EXTENT_DIRTY | EXTENT_UPTODATE,
345                                             NULL);
346                 if (ret)
347                         break;
348
349                 if (extent_start <= start) {
350                         start = extent_end + 1;
351                 } else if (extent_start > start && extent_start < end) {
352                         size = extent_start - start;
353                         total_added += size;
354                         ret = btrfs_add_free_space(block_group, start,
355                                                    size);
356                         BUG_ON(ret); /* -ENOMEM or logic error */
357                         start = extent_end + 1;
358                 } else {
359                         break;
360                 }
361         }
362
363         if (start < end) {
364                 size = end - start;
365                 total_added += size;
366                 ret = btrfs_add_free_space(block_group, start, size);
367                 BUG_ON(ret); /* -ENOMEM or logic error */
368         }
369
370         return total_added;
371 }
372
373 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
374 {
375         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
376         struct btrfs_fs_info *fs_info = block_group->fs_info;
377         struct btrfs_root *extent_root = fs_info->extent_root;
378         struct btrfs_path *path;
379         struct extent_buffer *leaf;
380         struct btrfs_key key;
381         u64 total_found = 0;
382         u64 last = 0;
383         u32 nritems;
384         int ret;
385         bool wakeup = true;
386
387         path = btrfs_alloc_path();
388         if (!path)
389                 return -ENOMEM;
390
391         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
392
393 #ifdef CONFIG_BTRFS_DEBUG
394         /*
395          * If we're fragmenting we don't want to make anybody think we can
396          * allocate from this block group until we've had a chance to fragment
397          * the free space.
398          */
399         if (btrfs_should_fragment_free_space(block_group))
400                 wakeup = false;
401 #endif
402         /*
403          * We don't want to deadlock with somebody trying to allocate a new
404          * extent for the extent root while also trying to search the extent
405          * root to add free space.  So we skip locking and search the commit
406          * root, since its read-only
407          */
408         path->skip_locking = 1;
409         path->search_commit_root = 1;
410         path->reada = READA_FORWARD;
411
412         key.objectid = last;
413         key.offset = 0;
414         key.type = BTRFS_EXTENT_ITEM_KEY;
415
416 next:
417         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
418         if (ret < 0)
419                 goto out;
420
421         leaf = path->nodes[0];
422         nritems = btrfs_header_nritems(leaf);
423
424         while (1) {
425                 if (btrfs_fs_closing(fs_info) > 1) {
426                         last = (u64)-1;
427                         break;
428                 }
429
430                 if (path->slots[0] < nritems) {
431                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
432                 } else {
433                         ret = find_next_key(path, 0, &key);
434                         if (ret)
435                                 break;
436
437                         if (need_resched() ||
438                             rwsem_is_contended(&fs_info->commit_root_sem)) {
439                                 if (wakeup)
440                                         caching_ctl->progress = last;
441                                 btrfs_release_path(path);
442                                 up_read(&fs_info->commit_root_sem);
443                                 mutex_unlock(&caching_ctl->mutex);
444                                 cond_resched();
445                                 mutex_lock(&caching_ctl->mutex);
446                                 down_read(&fs_info->commit_root_sem);
447                                 goto next;
448                         }
449
450                         ret = btrfs_next_leaf(extent_root, path);
451                         if (ret < 0)
452                                 goto out;
453                         if (ret)
454                                 break;
455                         leaf = path->nodes[0];
456                         nritems = btrfs_header_nritems(leaf);
457                         continue;
458                 }
459
460                 if (key.objectid < last) {
461                         key.objectid = last;
462                         key.offset = 0;
463                         key.type = BTRFS_EXTENT_ITEM_KEY;
464
465                         if (wakeup)
466                                 caching_ctl->progress = last;
467                         btrfs_release_path(path);
468                         goto next;
469                 }
470
471                 if (key.objectid < block_group->key.objectid) {
472                         path->slots[0]++;
473                         continue;
474                 }
475
476                 if (key.objectid >= block_group->key.objectid +
477                     block_group->key.offset)
478                         break;
479
480                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
481                     key.type == BTRFS_METADATA_ITEM_KEY) {
482                         total_found += add_new_free_space(block_group, last,
483                                                           key.objectid);
484                         if (key.type == BTRFS_METADATA_ITEM_KEY)
485                                 last = key.objectid +
486                                         fs_info->nodesize;
487                         else
488                                 last = key.objectid + key.offset;
489
490                         if (total_found > CACHING_CTL_WAKE_UP) {
491                                 total_found = 0;
492                                 if (wakeup)
493                                         wake_up(&caching_ctl->wait);
494                         }
495                 }
496                 path->slots[0]++;
497         }
498         ret = 0;
499
500         total_found += add_new_free_space(block_group, last,
501                                           block_group->key.objectid +
502                                           block_group->key.offset);
503         caching_ctl->progress = (u64)-1;
504
505 out:
506         btrfs_free_path(path);
507         return ret;
508 }
509
510 static noinline void caching_thread(struct btrfs_work *work)
511 {
512         struct btrfs_block_group_cache *block_group;
513         struct btrfs_fs_info *fs_info;
514         struct btrfs_caching_control *caching_ctl;
515         int ret;
516
517         caching_ctl = container_of(work, struct btrfs_caching_control, work);
518         block_group = caching_ctl->block_group;
519         fs_info = block_group->fs_info;
520
521         mutex_lock(&caching_ctl->mutex);
522         down_read(&fs_info->commit_root_sem);
523
524         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
525                 ret = load_free_space_tree(caching_ctl);
526         else
527                 ret = load_extent_tree_free(caching_ctl);
528
529         spin_lock(&block_group->lock);
530         block_group->caching_ctl = NULL;
531         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
532         spin_unlock(&block_group->lock);
533
534 #ifdef CONFIG_BTRFS_DEBUG
535         if (btrfs_should_fragment_free_space(block_group)) {
536                 u64 bytes_used;
537
538                 spin_lock(&block_group->space_info->lock);
539                 spin_lock(&block_group->lock);
540                 bytes_used = block_group->key.offset -
541                         btrfs_block_group_used(&block_group->item);
542                 block_group->space_info->bytes_used += bytes_used >> 1;
543                 spin_unlock(&block_group->lock);
544                 spin_unlock(&block_group->space_info->lock);
545                 fragment_free_space(block_group);
546         }
547 #endif
548
549         caching_ctl->progress = (u64)-1;
550
551         up_read(&fs_info->commit_root_sem);
552         free_excluded_extents(block_group);
553         mutex_unlock(&caching_ctl->mutex);
554
555         wake_up(&caching_ctl->wait);
556
557         put_caching_control(caching_ctl);
558         btrfs_put_block_group(block_group);
559 }
560
561 static int cache_block_group(struct btrfs_block_group_cache *cache,
562                              int load_cache_only)
563 {
564         DEFINE_WAIT(wait);
565         struct btrfs_fs_info *fs_info = cache->fs_info;
566         struct btrfs_caching_control *caching_ctl;
567         int ret = 0;
568
569         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
570         if (!caching_ctl)
571                 return -ENOMEM;
572
573         INIT_LIST_HEAD(&caching_ctl->list);
574         mutex_init(&caching_ctl->mutex);
575         init_waitqueue_head(&caching_ctl->wait);
576         caching_ctl->block_group = cache;
577         caching_ctl->progress = cache->key.objectid;
578         refcount_set(&caching_ctl->count, 1);
579         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
580                         caching_thread, NULL, NULL);
581
582         spin_lock(&cache->lock);
583         /*
584          * This should be a rare occasion, but this could happen I think in the
585          * case where one thread starts to load the space cache info, and then
586          * some other thread starts a transaction commit which tries to do an
587          * allocation while the other thread is still loading the space cache
588          * info.  The previous loop should have kept us from choosing this block
589          * group, but if we've moved to the state where we will wait on caching
590          * block groups we need to first check if we're doing a fast load here,
591          * so we can wait for it to finish, otherwise we could end up allocating
592          * from a block group who's cache gets evicted for one reason or
593          * another.
594          */
595         while (cache->cached == BTRFS_CACHE_FAST) {
596                 struct btrfs_caching_control *ctl;
597
598                 ctl = cache->caching_ctl;
599                 refcount_inc(&ctl->count);
600                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
601                 spin_unlock(&cache->lock);
602
603                 schedule();
604
605                 finish_wait(&ctl->wait, &wait);
606                 put_caching_control(ctl);
607                 spin_lock(&cache->lock);
608         }
609
610         if (cache->cached != BTRFS_CACHE_NO) {
611                 spin_unlock(&cache->lock);
612                 kfree(caching_ctl);
613                 return 0;
614         }
615         WARN_ON(cache->caching_ctl);
616         cache->caching_ctl = caching_ctl;
617         cache->cached = BTRFS_CACHE_FAST;
618         spin_unlock(&cache->lock);
619
620         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
621                 mutex_lock(&caching_ctl->mutex);
622                 ret = load_free_space_cache(cache);
623
624                 spin_lock(&cache->lock);
625                 if (ret == 1) {
626                         cache->caching_ctl = NULL;
627                         cache->cached = BTRFS_CACHE_FINISHED;
628                         cache->last_byte_to_unpin = (u64)-1;
629                         caching_ctl->progress = (u64)-1;
630                 } else {
631                         if (load_cache_only) {
632                                 cache->caching_ctl = NULL;
633                                 cache->cached = BTRFS_CACHE_NO;
634                         } else {
635                                 cache->cached = BTRFS_CACHE_STARTED;
636                                 cache->has_caching_ctl = 1;
637                         }
638                 }
639                 spin_unlock(&cache->lock);
640 #ifdef CONFIG_BTRFS_DEBUG
641                 if (ret == 1 &&
642                     btrfs_should_fragment_free_space(cache)) {
643                         u64 bytes_used;
644
645                         spin_lock(&cache->space_info->lock);
646                         spin_lock(&cache->lock);
647                         bytes_used = cache->key.offset -
648                                 btrfs_block_group_used(&cache->item);
649                         cache->space_info->bytes_used += bytes_used >> 1;
650                         spin_unlock(&cache->lock);
651                         spin_unlock(&cache->space_info->lock);
652                         fragment_free_space(cache);
653                 }
654 #endif
655                 mutex_unlock(&caching_ctl->mutex);
656
657                 wake_up(&caching_ctl->wait);
658                 if (ret == 1) {
659                         put_caching_control(caching_ctl);
660                         free_excluded_extents(cache);
661                         return 0;
662                 }
663         } else {
664                 /*
665                  * We're either using the free space tree or no caching at all.
666                  * Set cached to the appropriate value and wakeup any waiters.
667                  */
668                 spin_lock(&cache->lock);
669                 if (load_cache_only) {
670                         cache->caching_ctl = NULL;
671                         cache->cached = BTRFS_CACHE_NO;
672                 } else {
673                         cache->cached = BTRFS_CACHE_STARTED;
674                         cache->has_caching_ctl = 1;
675                 }
676                 spin_unlock(&cache->lock);
677                 wake_up(&caching_ctl->wait);
678         }
679
680         if (load_cache_only) {
681                 put_caching_control(caching_ctl);
682                 return 0;
683         }
684
685         down_write(&fs_info->commit_root_sem);
686         refcount_inc(&caching_ctl->count);
687         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
688         up_write(&fs_info->commit_root_sem);
689
690         btrfs_get_block_group(cache);
691
692         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
693
694         return ret;
695 }
696
697 /*
698  * return the block group that starts at or after bytenr
699  */
700 static struct btrfs_block_group_cache *
701 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
702 {
703         return block_group_cache_tree_search(info, bytenr, 0);
704 }
705
706 /*
707  * return the block group that contains the given bytenr
708  */
709 struct btrfs_block_group_cache *btrfs_lookup_block_group(
710                                                  struct btrfs_fs_info *info,
711                                                  u64 bytenr)
712 {
713         return block_group_cache_tree_search(info, bytenr, 1);
714 }
715
716 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
717 {
718         if (ref->type == BTRFS_REF_METADATA) {
719                 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
720                         return BTRFS_BLOCK_GROUP_SYSTEM;
721                 else
722                         return BTRFS_BLOCK_GROUP_METADATA;
723         }
724         return BTRFS_BLOCK_GROUP_DATA;
725 }
726
727 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
728                              struct btrfs_ref *ref)
729 {
730         struct btrfs_space_info *space_info;
731         u64 flags = generic_ref_to_space_flags(ref);
732
733         space_info = btrfs_find_space_info(fs_info, flags);
734         ASSERT(space_info);
735         percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
736                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
737 }
738
739 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
740                              struct btrfs_ref *ref)
741 {
742         struct btrfs_space_info *space_info;
743         u64 flags = generic_ref_to_space_flags(ref);
744
745         space_info = btrfs_find_space_info(fs_info, flags);
746         ASSERT(space_info);
747         percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
748                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
749 }
750
751 /* simple helper to search for an existing data extent at a given offset */
752 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
753 {
754         int ret;
755         struct btrfs_key key;
756         struct btrfs_path *path;
757
758         path = btrfs_alloc_path();
759         if (!path)
760                 return -ENOMEM;
761
762         key.objectid = start;
763         key.offset = len;
764         key.type = BTRFS_EXTENT_ITEM_KEY;
765         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
766         btrfs_free_path(path);
767         return ret;
768 }
769
770 /*
771  * helper function to lookup reference count and flags of a tree block.
772  *
773  * the head node for delayed ref is used to store the sum of all the
774  * reference count modifications queued up in the rbtree. the head
775  * node may also store the extent flags to set. This way you can check
776  * to see what the reference count and extent flags would be if all of
777  * the delayed refs are not processed.
778  */
779 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
780                              struct btrfs_fs_info *fs_info, u64 bytenr,
781                              u64 offset, int metadata, u64 *refs, u64 *flags)
782 {
783         struct btrfs_delayed_ref_head *head;
784         struct btrfs_delayed_ref_root *delayed_refs;
785         struct btrfs_path *path;
786         struct btrfs_extent_item *ei;
787         struct extent_buffer *leaf;
788         struct btrfs_key key;
789         u32 item_size;
790         u64 num_refs;
791         u64 extent_flags;
792         int ret;
793
794         /*
795          * If we don't have skinny metadata, don't bother doing anything
796          * different
797          */
798         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
799                 offset = fs_info->nodesize;
800                 metadata = 0;
801         }
802
803         path = btrfs_alloc_path();
804         if (!path)
805                 return -ENOMEM;
806
807         if (!trans) {
808                 path->skip_locking = 1;
809                 path->search_commit_root = 1;
810         }
811
812 search_again:
813         key.objectid = bytenr;
814         key.offset = offset;
815         if (metadata)
816                 key.type = BTRFS_METADATA_ITEM_KEY;
817         else
818                 key.type = BTRFS_EXTENT_ITEM_KEY;
819
820         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
821         if (ret < 0)
822                 goto out_free;
823
824         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
825                 if (path->slots[0]) {
826                         path->slots[0]--;
827                         btrfs_item_key_to_cpu(path->nodes[0], &key,
828                                               path->slots[0]);
829                         if (key.objectid == bytenr &&
830                             key.type == BTRFS_EXTENT_ITEM_KEY &&
831                             key.offset == fs_info->nodesize)
832                                 ret = 0;
833                 }
834         }
835
836         if (ret == 0) {
837                 leaf = path->nodes[0];
838                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
839                 if (item_size >= sizeof(*ei)) {
840                         ei = btrfs_item_ptr(leaf, path->slots[0],
841                                             struct btrfs_extent_item);
842                         num_refs = btrfs_extent_refs(leaf, ei);
843                         extent_flags = btrfs_extent_flags(leaf, ei);
844                 } else {
845                         ret = -EINVAL;
846                         btrfs_print_v0_err(fs_info);
847                         if (trans)
848                                 btrfs_abort_transaction(trans, ret);
849                         else
850                                 btrfs_handle_fs_error(fs_info, ret, NULL);
851
852                         goto out_free;
853                 }
854
855                 BUG_ON(num_refs == 0);
856         } else {
857                 num_refs = 0;
858                 extent_flags = 0;
859                 ret = 0;
860         }
861
862         if (!trans)
863                 goto out;
864
865         delayed_refs = &trans->transaction->delayed_refs;
866         spin_lock(&delayed_refs->lock);
867         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
868         if (head) {
869                 if (!mutex_trylock(&head->mutex)) {
870                         refcount_inc(&head->refs);
871                         spin_unlock(&delayed_refs->lock);
872
873                         btrfs_release_path(path);
874
875                         /*
876                          * Mutex was contended, block until it's released and try
877                          * again
878                          */
879                         mutex_lock(&head->mutex);
880                         mutex_unlock(&head->mutex);
881                         btrfs_put_delayed_ref_head(head);
882                         goto search_again;
883                 }
884                 spin_lock(&head->lock);
885                 if (head->extent_op && head->extent_op->update_flags)
886                         extent_flags |= head->extent_op->flags_to_set;
887                 else
888                         BUG_ON(num_refs == 0);
889
890                 num_refs += head->ref_mod;
891                 spin_unlock(&head->lock);
892                 mutex_unlock(&head->mutex);
893         }
894         spin_unlock(&delayed_refs->lock);
895 out:
896         WARN_ON(num_refs == 0);
897         if (refs)
898                 *refs = num_refs;
899         if (flags)
900                 *flags = extent_flags;
901 out_free:
902         btrfs_free_path(path);
903         return ret;
904 }
905
906 /*
907  * Back reference rules.  Back refs have three main goals:
908  *
909  * 1) differentiate between all holders of references to an extent so that
910  *    when a reference is dropped we can make sure it was a valid reference
911  *    before freeing the extent.
912  *
913  * 2) Provide enough information to quickly find the holders of an extent
914  *    if we notice a given block is corrupted or bad.
915  *
916  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
917  *    maintenance.  This is actually the same as #2, but with a slightly
918  *    different use case.
919  *
920  * There are two kinds of back refs. The implicit back refs is optimized
921  * for pointers in non-shared tree blocks. For a given pointer in a block,
922  * back refs of this kind provide information about the block's owner tree
923  * and the pointer's key. These information allow us to find the block by
924  * b-tree searching. The full back refs is for pointers in tree blocks not
925  * referenced by their owner trees. The location of tree block is recorded
926  * in the back refs. Actually the full back refs is generic, and can be
927  * used in all cases the implicit back refs is used. The major shortcoming
928  * of the full back refs is its overhead. Every time a tree block gets
929  * COWed, we have to update back refs entry for all pointers in it.
930  *
931  * For a newly allocated tree block, we use implicit back refs for
932  * pointers in it. This means most tree related operations only involve
933  * implicit back refs. For a tree block created in old transaction, the
934  * only way to drop a reference to it is COW it. So we can detect the
935  * event that tree block loses its owner tree's reference and do the
936  * back refs conversion.
937  *
938  * When a tree block is COWed through a tree, there are four cases:
939  *
940  * The reference count of the block is one and the tree is the block's
941  * owner tree. Nothing to do in this case.
942  *
943  * The reference count of the block is one and the tree is not the
944  * block's owner tree. In this case, full back refs is used for pointers
945  * in the block. Remove these full back refs, add implicit back refs for
946  * every pointers in the new block.
947  *
948  * The reference count of the block is greater than one and the tree is
949  * the block's owner tree. In this case, implicit back refs is used for
950  * pointers in the block. Add full back refs for every pointers in the
951  * block, increase lower level extents' reference counts. The original
952  * implicit back refs are entailed to the new block.
953  *
954  * The reference count of the block is greater than one and the tree is
955  * not the block's owner tree. Add implicit back refs for every pointer in
956  * the new block, increase lower level extents' reference count.
957  *
958  * Back Reference Key composing:
959  *
960  * The key objectid corresponds to the first byte in the extent,
961  * The key type is used to differentiate between types of back refs.
962  * There are different meanings of the key offset for different types
963  * of back refs.
964  *
965  * File extents can be referenced by:
966  *
967  * - multiple snapshots, subvolumes, or different generations in one subvol
968  * - different files inside a single subvolume
969  * - different offsets inside a file (bookend extents in file.c)
970  *
971  * The extent ref structure for the implicit back refs has fields for:
972  *
973  * - Objectid of the subvolume root
974  * - objectid of the file holding the reference
975  * - original offset in the file
976  * - how many bookend extents
977  *
978  * The key offset for the implicit back refs is hash of the first
979  * three fields.
980  *
981  * The extent ref structure for the full back refs has field for:
982  *
983  * - number of pointers in the tree leaf
984  *
985  * The key offset for the implicit back refs is the first byte of
986  * the tree leaf
987  *
988  * When a file extent is allocated, The implicit back refs is used.
989  * the fields are filled in:
990  *
991  *     (root_key.objectid, inode objectid, offset in file, 1)
992  *
993  * When a file extent is removed file truncation, we find the
994  * corresponding implicit back refs and check the following fields:
995  *
996  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
997  *
998  * Btree extents can be referenced by:
999  *
1000  * - Different subvolumes
1001  *
1002  * Both the implicit back refs and the full back refs for tree blocks
1003  * only consist of key. The key offset for the implicit back refs is
1004  * objectid of block's owner tree. The key offset for the full back refs
1005  * is the first byte of parent block.
1006  *
1007  * When implicit back refs is used, information about the lowest key and
1008  * level of the tree block are required. These information are stored in
1009  * tree block info structure.
1010  */
1011
1012 /*
1013  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1014  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
1015  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1016  */
1017 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1018                                      struct btrfs_extent_inline_ref *iref,
1019                                      enum btrfs_inline_ref_type is_data)
1020 {
1021         int type = btrfs_extent_inline_ref_type(eb, iref);
1022         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1023
1024         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1025             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1026             type == BTRFS_SHARED_DATA_REF_KEY ||
1027             type == BTRFS_EXTENT_DATA_REF_KEY) {
1028                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1029                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1030                                 return type;
1031                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1032                                 ASSERT(eb->fs_info);
1033                                 /*
1034                                  * Every shared one has parent tree
1035                                  * block, which must be aligned to
1036                                  * nodesize.
1037                                  */
1038                                 if (offset &&
1039                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1040                                         return type;
1041                         }
1042                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1043                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1044                                 return type;
1045                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1046                                 ASSERT(eb->fs_info);
1047                                 /*
1048                                  * Every shared one has parent tree
1049                                  * block, which must be aligned to
1050                                  * nodesize.
1051                                  */
1052                                 if (offset &&
1053                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1054                                         return type;
1055                         }
1056                 } else {
1057                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1058                         return type;
1059                 }
1060         }
1061
1062         btrfs_print_leaf((struct extent_buffer *)eb);
1063         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1064                   eb->start, type);
1065         WARN_ON(1);
1066
1067         return BTRFS_REF_TYPE_INVALID;
1068 }
1069
1070 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1071 {
1072         u32 high_crc = ~(u32)0;
1073         u32 low_crc = ~(u32)0;
1074         __le64 lenum;
1075
1076         lenum = cpu_to_le64(root_objectid);
1077         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1078         lenum = cpu_to_le64(owner);
1079         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1080         lenum = cpu_to_le64(offset);
1081         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1082
1083         return ((u64)high_crc << 31) ^ (u64)low_crc;
1084 }
1085
1086 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1087                                      struct btrfs_extent_data_ref *ref)
1088 {
1089         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1090                                     btrfs_extent_data_ref_objectid(leaf, ref),
1091                                     btrfs_extent_data_ref_offset(leaf, ref));
1092 }
1093
1094 static int match_extent_data_ref(struct extent_buffer *leaf,
1095                                  struct btrfs_extent_data_ref *ref,
1096                                  u64 root_objectid, u64 owner, u64 offset)
1097 {
1098         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1099             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1100             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1101                 return 0;
1102         return 1;
1103 }
1104
1105 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1106                                            struct btrfs_path *path,
1107                                            u64 bytenr, u64 parent,
1108                                            u64 root_objectid,
1109                                            u64 owner, u64 offset)
1110 {
1111         struct btrfs_root *root = trans->fs_info->extent_root;
1112         struct btrfs_key key;
1113         struct btrfs_extent_data_ref *ref;
1114         struct extent_buffer *leaf;
1115         u32 nritems;
1116         int ret;
1117         int recow;
1118         int err = -ENOENT;
1119
1120         key.objectid = bytenr;
1121         if (parent) {
1122                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1123                 key.offset = parent;
1124         } else {
1125                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1126                 key.offset = hash_extent_data_ref(root_objectid,
1127                                                   owner, offset);
1128         }
1129 again:
1130         recow = 0;
1131         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1132         if (ret < 0) {
1133                 err = ret;
1134                 goto fail;
1135         }
1136
1137         if (parent) {
1138                 if (!ret)
1139                         return 0;
1140                 goto fail;
1141         }
1142
1143         leaf = path->nodes[0];
1144         nritems = btrfs_header_nritems(leaf);
1145         while (1) {
1146                 if (path->slots[0] >= nritems) {
1147                         ret = btrfs_next_leaf(root, path);
1148                         if (ret < 0)
1149                                 err = ret;
1150                         if (ret)
1151                                 goto fail;
1152
1153                         leaf = path->nodes[0];
1154                         nritems = btrfs_header_nritems(leaf);
1155                         recow = 1;
1156                 }
1157
1158                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1159                 if (key.objectid != bytenr ||
1160                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1161                         goto fail;
1162
1163                 ref = btrfs_item_ptr(leaf, path->slots[0],
1164                                      struct btrfs_extent_data_ref);
1165
1166                 if (match_extent_data_ref(leaf, ref, root_objectid,
1167                                           owner, offset)) {
1168                         if (recow) {
1169                                 btrfs_release_path(path);
1170                                 goto again;
1171                         }
1172                         err = 0;
1173                         break;
1174                 }
1175                 path->slots[0]++;
1176         }
1177 fail:
1178         return err;
1179 }
1180
1181 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1182                                            struct btrfs_path *path,
1183                                            u64 bytenr, u64 parent,
1184                                            u64 root_objectid, u64 owner,
1185                                            u64 offset, int refs_to_add)
1186 {
1187         struct btrfs_root *root = trans->fs_info->extent_root;
1188         struct btrfs_key key;
1189         struct extent_buffer *leaf;
1190         u32 size;
1191         u32 num_refs;
1192         int ret;
1193
1194         key.objectid = bytenr;
1195         if (parent) {
1196                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1197                 key.offset = parent;
1198                 size = sizeof(struct btrfs_shared_data_ref);
1199         } else {
1200                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1201                 key.offset = hash_extent_data_ref(root_objectid,
1202                                                   owner, offset);
1203                 size = sizeof(struct btrfs_extent_data_ref);
1204         }
1205
1206         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1207         if (ret && ret != -EEXIST)
1208                 goto fail;
1209
1210         leaf = path->nodes[0];
1211         if (parent) {
1212                 struct btrfs_shared_data_ref *ref;
1213                 ref = btrfs_item_ptr(leaf, path->slots[0],
1214                                      struct btrfs_shared_data_ref);
1215                 if (ret == 0) {
1216                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1217                 } else {
1218                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1219                         num_refs += refs_to_add;
1220                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1221                 }
1222         } else {
1223                 struct btrfs_extent_data_ref *ref;
1224                 while (ret == -EEXIST) {
1225                         ref = btrfs_item_ptr(leaf, path->slots[0],
1226                                              struct btrfs_extent_data_ref);
1227                         if (match_extent_data_ref(leaf, ref, root_objectid,
1228                                                   owner, offset))
1229                                 break;
1230                         btrfs_release_path(path);
1231                         key.offset++;
1232                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1233                                                       size);
1234                         if (ret && ret != -EEXIST)
1235                                 goto fail;
1236
1237                         leaf = path->nodes[0];
1238                 }
1239                 ref = btrfs_item_ptr(leaf, path->slots[0],
1240                                      struct btrfs_extent_data_ref);
1241                 if (ret == 0) {
1242                         btrfs_set_extent_data_ref_root(leaf, ref,
1243                                                        root_objectid);
1244                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1245                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1246                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1247                 } else {
1248                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1249                         num_refs += refs_to_add;
1250                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1251                 }
1252         }
1253         btrfs_mark_buffer_dirty(leaf);
1254         ret = 0;
1255 fail:
1256         btrfs_release_path(path);
1257         return ret;
1258 }
1259
1260 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1261                                            struct btrfs_path *path,
1262                                            int refs_to_drop, int *last_ref)
1263 {
1264         struct btrfs_key key;
1265         struct btrfs_extent_data_ref *ref1 = NULL;
1266         struct btrfs_shared_data_ref *ref2 = NULL;
1267         struct extent_buffer *leaf;
1268         u32 num_refs = 0;
1269         int ret = 0;
1270
1271         leaf = path->nodes[0];
1272         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1273
1274         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1275                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1276                                       struct btrfs_extent_data_ref);
1277                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1278         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1279                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1280                                       struct btrfs_shared_data_ref);
1281                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1282         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1283                 btrfs_print_v0_err(trans->fs_info);
1284                 btrfs_abort_transaction(trans, -EINVAL);
1285                 return -EINVAL;
1286         } else {
1287                 BUG();
1288         }
1289
1290         BUG_ON(num_refs < refs_to_drop);
1291         num_refs -= refs_to_drop;
1292
1293         if (num_refs == 0) {
1294                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1295                 *last_ref = 1;
1296         } else {
1297                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1298                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1299                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1300                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1301                 btrfs_mark_buffer_dirty(leaf);
1302         }
1303         return ret;
1304 }
1305
1306 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1307                                           struct btrfs_extent_inline_ref *iref)
1308 {
1309         struct btrfs_key key;
1310         struct extent_buffer *leaf;
1311         struct btrfs_extent_data_ref *ref1;
1312         struct btrfs_shared_data_ref *ref2;
1313         u32 num_refs = 0;
1314         int type;
1315
1316         leaf = path->nodes[0];
1317         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1318
1319         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1320         if (iref) {
1321                 /*
1322                  * If type is invalid, we should have bailed out earlier than
1323                  * this call.
1324                  */
1325                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1326                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1327                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1328                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1329                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1330                 } else {
1331                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1332                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1333                 }
1334         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1335                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1336                                       struct btrfs_extent_data_ref);
1337                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1338         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1339                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1340                                       struct btrfs_shared_data_ref);
1341                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1342         } else {
1343                 WARN_ON(1);
1344         }
1345         return num_refs;
1346 }
1347
1348 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1349                                           struct btrfs_path *path,
1350                                           u64 bytenr, u64 parent,
1351                                           u64 root_objectid)
1352 {
1353         struct btrfs_root *root = trans->fs_info->extent_root;
1354         struct btrfs_key key;
1355         int ret;
1356
1357         key.objectid = bytenr;
1358         if (parent) {
1359                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1360                 key.offset = parent;
1361         } else {
1362                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1363                 key.offset = root_objectid;
1364         }
1365
1366         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1367         if (ret > 0)
1368                 ret = -ENOENT;
1369         return ret;
1370 }
1371
1372 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1373                                           struct btrfs_path *path,
1374                                           u64 bytenr, u64 parent,
1375                                           u64 root_objectid)
1376 {
1377         struct btrfs_key key;
1378         int ret;
1379
1380         key.objectid = bytenr;
1381         if (parent) {
1382                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1383                 key.offset = parent;
1384         } else {
1385                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1386                 key.offset = root_objectid;
1387         }
1388
1389         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1390                                       path, &key, 0);
1391         btrfs_release_path(path);
1392         return ret;
1393 }
1394
1395 static inline int extent_ref_type(u64 parent, u64 owner)
1396 {
1397         int type;
1398         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1399                 if (parent > 0)
1400                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1401                 else
1402                         type = BTRFS_TREE_BLOCK_REF_KEY;
1403         } else {
1404                 if (parent > 0)
1405                         type = BTRFS_SHARED_DATA_REF_KEY;
1406                 else
1407                         type = BTRFS_EXTENT_DATA_REF_KEY;
1408         }
1409         return type;
1410 }
1411
1412 static int find_next_key(struct btrfs_path *path, int level,
1413                          struct btrfs_key *key)
1414
1415 {
1416         for (; level < BTRFS_MAX_LEVEL; level++) {
1417                 if (!path->nodes[level])
1418                         break;
1419                 if (path->slots[level] + 1 >=
1420                     btrfs_header_nritems(path->nodes[level]))
1421                         continue;
1422                 if (level == 0)
1423                         btrfs_item_key_to_cpu(path->nodes[level], key,
1424                                               path->slots[level] + 1);
1425                 else
1426                         btrfs_node_key_to_cpu(path->nodes[level], key,
1427                                               path->slots[level] + 1);
1428                 return 0;
1429         }
1430         return 1;
1431 }
1432
1433 /*
1434  * look for inline back ref. if back ref is found, *ref_ret is set
1435  * to the address of inline back ref, and 0 is returned.
1436  *
1437  * if back ref isn't found, *ref_ret is set to the address where it
1438  * should be inserted, and -ENOENT is returned.
1439  *
1440  * if insert is true and there are too many inline back refs, the path
1441  * points to the extent item, and -EAGAIN is returned.
1442  *
1443  * NOTE: inline back refs are ordered in the same way that back ref
1444  *       items in the tree are ordered.
1445  */
1446 static noinline_for_stack
1447 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1448                                  struct btrfs_path *path,
1449                                  struct btrfs_extent_inline_ref **ref_ret,
1450                                  u64 bytenr, u64 num_bytes,
1451                                  u64 parent, u64 root_objectid,
1452                                  u64 owner, u64 offset, int insert)
1453 {
1454         struct btrfs_fs_info *fs_info = trans->fs_info;
1455         struct btrfs_root *root = fs_info->extent_root;
1456         struct btrfs_key key;
1457         struct extent_buffer *leaf;
1458         struct btrfs_extent_item *ei;
1459         struct btrfs_extent_inline_ref *iref;
1460         u64 flags;
1461         u64 item_size;
1462         unsigned long ptr;
1463         unsigned long end;
1464         int extra_size;
1465         int type;
1466         int want;
1467         int ret;
1468         int err = 0;
1469         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1470         int needed;
1471
1472         key.objectid = bytenr;
1473         key.type = BTRFS_EXTENT_ITEM_KEY;
1474         key.offset = num_bytes;
1475
1476         want = extent_ref_type(parent, owner);
1477         if (insert) {
1478                 extra_size = btrfs_extent_inline_ref_size(want);
1479                 path->keep_locks = 1;
1480         } else
1481                 extra_size = -1;
1482
1483         /*
1484          * Owner is our level, so we can just add one to get the level for the
1485          * block we are interested in.
1486          */
1487         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1488                 key.type = BTRFS_METADATA_ITEM_KEY;
1489                 key.offset = owner;
1490         }
1491
1492 again:
1493         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1494         if (ret < 0) {
1495                 err = ret;
1496                 goto out;
1497         }
1498
1499         /*
1500          * We may be a newly converted file system which still has the old fat
1501          * extent entries for metadata, so try and see if we have one of those.
1502          */
1503         if (ret > 0 && skinny_metadata) {
1504                 skinny_metadata = false;
1505                 if (path->slots[0]) {
1506                         path->slots[0]--;
1507                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1508                                               path->slots[0]);
1509                         if (key.objectid == bytenr &&
1510                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1511                             key.offset == num_bytes)
1512                                 ret = 0;
1513                 }
1514                 if (ret) {
1515                         key.objectid = bytenr;
1516                         key.type = BTRFS_EXTENT_ITEM_KEY;
1517                         key.offset = num_bytes;
1518                         btrfs_release_path(path);
1519                         goto again;
1520                 }
1521         }
1522
1523         if (ret && !insert) {
1524                 err = -ENOENT;
1525                 goto out;
1526         } else if (WARN_ON(ret)) {
1527                 err = -EIO;
1528                 goto out;
1529         }
1530
1531         leaf = path->nodes[0];
1532         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1533         if (unlikely(item_size < sizeof(*ei))) {
1534                 err = -EINVAL;
1535                 btrfs_print_v0_err(fs_info);
1536                 btrfs_abort_transaction(trans, err);
1537                 goto out;
1538         }
1539
1540         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1541         flags = btrfs_extent_flags(leaf, ei);
1542
1543         ptr = (unsigned long)(ei + 1);
1544         end = (unsigned long)ei + item_size;
1545
1546         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1547                 ptr += sizeof(struct btrfs_tree_block_info);
1548                 BUG_ON(ptr > end);
1549         }
1550
1551         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1552                 needed = BTRFS_REF_TYPE_DATA;
1553         else
1554                 needed = BTRFS_REF_TYPE_BLOCK;
1555
1556         err = -ENOENT;
1557         while (1) {
1558                 if (ptr >= end) {
1559                         WARN_ON(ptr > end);
1560                         break;
1561                 }
1562                 iref = (struct btrfs_extent_inline_ref *)ptr;
1563                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1564                 if (type == BTRFS_REF_TYPE_INVALID) {
1565                         err = -EUCLEAN;
1566                         goto out;
1567                 }
1568
1569                 if (want < type)
1570                         break;
1571                 if (want > type) {
1572                         ptr += btrfs_extent_inline_ref_size(type);
1573                         continue;
1574                 }
1575
1576                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1577                         struct btrfs_extent_data_ref *dref;
1578                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1579                         if (match_extent_data_ref(leaf, dref, root_objectid,
1580                                                   owner, offset)) {
1581                                 err = 0;
1582                                 break;
1583                         }
1584                         if (hash_extent_data_ref_item(leaf, dref) <
1585                             hash_extent_data_ref(root_objectid, owner, offset))
1586                                 break;
1587                 } else {
1588                         u64 ref_offset;
1589                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1590                         if (parent > 0) {
1591                                 if (parent == ref_offset) {
1592                                         err = 0;
1593                                         break;
1594                                 }
1595                                 if (ref_offset < parent)
1596                                         break;
1597                         } else {
1598                                 if (root_objectid == ref_offset) {
1599                                         err = 0;
1600                                         break;
1601                                 }
1602                                 if (ref_offset < root_objectid)
1603                                         break;
1604                         }
1605                 }
1606                 ptr += btrfs_extent_inline_ref_size(type);
1607         }
1608         if (err == -ENOENT && insert) {
1609                 if (item_size + extra_size >=
1610                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1611                         err = -EAGAIN;
1612                         goto out;
1613                 }
1614                 /*
1615                  * To add new inline back ref, we have to make sure
1616                  * there is no corresponding back ref item.
1617                  * For simplicity, we just do not add new inline back
1618                  * ref if there is any kind of item for this block
1619                  */
1620                 if (find_next_key(path, 0, &key) == 0 &&
1621                     key.objectid == bytenr &&
1622                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1623                         err = -EAGAIN;
1624                         goto out;
1625                 }
1626         }
1627         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1628 out:
1629         if (insert) {
1630                 path->keep_locks = 0;
1631                 btrfs_unlock_up_safe(path, 1);
1632         }
1633         return err;
1634 }
1635
1636 /*
1637  * helper to add new inline back ref
1638  */
1639 static noinline_for_stack
1640 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1641                                  struct btrfs_path *path,
1642                                  struct btrfs_extent_inline_ref *iref,
1643                                  u64 parent, u64 root_objectid,
1644                                  u64 owner, u64 offset, int refs_to_add,
1645                                  struct btrfs_delayed_extent_op *extent_op)
1646 {
1647         struct extent_buffer *leaf;
1648         struct btrfs_extent_item *ei;
1649         unsigned long ptr;
1650         unsigned long end;
1651         unsigned long item_offset;
1652         u64 refs;
1653         int size;
1654         int type;
1655
1656         leaf = path->nodes[0];
1657         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1658         item_offset = (unsigned long)iref - (unsigned long)ei;
1659
1660         type = extent_ref_type(parent, owner);
1661         size = btrfs_extent_inline_ref_size(type);
1662
1663         btrfs_extend_item(path, size);
1664
1665         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1666         refs = btrfs_extent_refs(leaf, ei);
1667         refs += refs_to_add;
1668         btrfs_set_extent_refs(leaf, ei, refs);
1669         if (extent_op)
1670                 __run_delayed_extent_op(extent_op, leaf, ei);
1671
1672         ptr = (unsigned long)ei + item_offset;
1673         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1674         if (ptr < end - size)
1675                 memmove_extent_buffer(leaf, ptr + size, ptr,
1676                                       end - size - ptr);
1677
1678         iref = (struct btrfs_extent_inline_ref *)ptr;
1679         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1680         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1681                 struct btrfs_extent_data_ref *dref;
1682                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1683                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1684                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1685                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1686                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1687         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1688                 struct btrfs_shared_data_ref *sref;
1689                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1690                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1691                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1692         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1693                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1694         } else {
1695                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1696         }
1697         btrfs_mark_buffer_dirty(leaf);
1698 }
1699
1700 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1701                                  struct btrfs_path *path,
1702                                  struct btrfs_extent_inline_ref **ref_ret,
1703                                  u64 bytenr, u64 num_bytes, u64 parent,
1704                                  u64 root_objectid, u64 owner, u64 offset)
1705 {
1706         int ret;
1707
1708         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1709                                            num_bytes, parent, root_objectid,
1710                                            owner, offset, 0);
1711         if (ret != -ENOENT)
1712                 return ret;
1713
1714         btrfs_release_path(path);
1715         *ref_ret = NULL;
1716
1717         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1718                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1719                                             root_objectid);
1720         } else {
1721                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1722                                              root_objectid, owner, offset);
1723         }
1724         return ret;
1725 }
1726
1727 /*
1728  * helper to update/remove inline back ref
1729  */
1730 static noinline_for_stack
1731 void update_inline_extent_backref(struct btrfs_path *path,
1732                                   struct btrfs_extent_inline_ref *iref,
1733                                   int refs_to_mod,
1734                                   struct btrfs_delayed_extent_op *extent_op,
1735                                   int *last_ref)
1736 {
1737         struct extent_buffer *leaf = path->nodes[0];
1738         struct btrfs_extent_item *ei;
1739         struct btrfs_extent_data_ref *dref = NULL;
1740         struct btrfs_shared_data_ref *sref = NULL;
1741         unsigned long ptr;
1742         unsigned long end;
1743         u32 item_size;
1744         int size;
1745         int type;
1746         u64 refs;
1747
1748         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1749         refs = btrfs_extent_refs(leaf, ei);
1750         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1751         refs += refs_to_mod;
1752         btrfs_set_extent_refs(leaf, ei, refs);
1753         if (extent_op)
1754                 __run_delayed_extent_op(extent_op, leaf, ei);
1755
1756         /*
1757          * If type is invalid, we should have bailed out after
1758          * lookup_inline_extent_backref().
1759          */
1760         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1761         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1762
1763         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1764                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1765                 refs = btrfs_extent_data_ref_count(leaf, dref);
1766         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1767                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1768                 refs = btrfs_shared_data_ref_count(leaf, sref);
1769         } else {
1770                 refs = 1;
1771                 BUG_ON(refs_to_mod != -1);
1772         }
1773
1774         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1775         refs += refs_to_mod;
1776
1777         if (refs > 0) {
1778                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1779                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1780                 else
1781                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1782         } else {
1783                 *last_ref = 1;
1784                 size =  btrfs_extent_inline_ref_size(type);
1785                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1786                 ptr = (unsigned long)iref;
1787                 end = (unsigned long)ei + item_size;
1788                 if (ptr + size < end)
1789                         memmove_extent_buffer(leaf, ptr, ptr + size,
1790                                               end - ptr - size);
1791                 item_size -= size;
1792                 btrfs_truncate_item(path, item_size, 1);
1793         }
1794         btrfs_mark_buffer_dirty(leaf);
1795 }
1796
1797 static noinline_for_stack
1798 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1799                                  struct btrfs_path *path,
1800                                  u64 bytenr, u64 num_bytes, u64 parent,
1801                                  u64 root_objectid, u64 owner,
1802                                  u64 offset, int refs_to_add,
1803                                  struct btrfs_delayed_extent_op *extent_op)
1804 {
1805         struct btrfs_extent_inline_ref *iref;
1806         int ret;
1807
1808         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1809                                            num_bytes, parent, root_objectid,
1810                                            owner, offset, 1);
1811         if (ret == 0) {
1812                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1813                 update_inline_extent_backref(path, iref, refs_to_add,
1814                                              extent_op, NULL);
1815         } else if (ret == -ENOENT) {
1816                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1817                                             root_objectid, owner, offset,
1818                                             refs_to_add, extent_op);
1819                 ret = 0;
1820         }
1821         return ret;
1822 }
1823
1824 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1825                                  struct btrfs_path *path,
1826                                  u64 bytenr, u64 parent, u64 root_objectid,
1827                                  u64 owner, u64 offset, int refs_to_add)
1828 {
1829         int ret;
1830         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1831                 BUG_ON(refs_to_add != 1);
1832                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1833                                             root_objectid);
1834         } else {
1835                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1836                                              root_objectid, owner, offset,
1837                                              refs_to_add);
1838         }
1839         return ret;
1840 }
1841
1842 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1843                                  struct btrfs_path *path,
1844                                  struct btrfs_extent_inline_ref *iref,
1845                                  int refs_to_drop, int is_data, int *last_ref)
1846 {
1847         int ret = 0;
1848
1849         BUG_ON(!is_data && refs_to_drop != 1);
1850         if (iref) {
1851                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1852                                              last_ref);
1853         } else if (is_data) {
1854                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1855                                              last_ref);
1856         } else {
1857                 *last_ref = 1;
1858                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1859         }
1860         return ret;
1861 }
1862
1863 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1864                                u64 *discarded_bytes)
1865 {
1866         int j, ret = 0;
1867         u64 bytes_left, end;
1868         u64 aligned_start = ALIGN(start, 1 << 9);
1869
1870         if (WARN_ON(start != aligned_start)) {
1871                 len -= aligned_start - start;
1872                 len = round_down(len, 1 << 9);
1873                 start = aligned_start;
1874         }
1875
1876         *discarded_bytes = 0;
1877
1878         if (!len)
1879                 return 0;
1880
1881         end = start + len;
1882         bytes_left = len;
1883
1884         /* Skip any superblocks on this device. */
1885         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1886                 u64 sb_start = btrfs_sb_offset(j);
1887                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1888                 u64 size = sb_start - start;
1889
1890                 if (!in_range(sb_start, start, bytes_left) &&
1891                     !in_range(sb_end, start, bytes_left) &&
1892                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1893                         continue;
1894
1895                 /*
1896                  * Superblock spans beginning of range.  Adjust start and
1897                  * try again.
1898                  */
1899                 if (sb_start <= start) {
1900                         start += sb_end - start;
1901                         if (start > end) {
1902                                 bytes_left = 0;
1903                                 break;
1904                         }
1905                         bytes_left = end - start;
1906                         continue;
1907                 }
1908
1909                 if (size) {
1910                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1911                                                    GFP_NOFS, 0);
1912                         if (!ret)
1913                                 *discarded_bytes += size;
1914                         else if (ret != -EOPNOTSUPP)
1915                                 return ret;
1916                 }
1917
1918                 start = sb_end;
1919                 if (start > end) {
1920                         bytes_left = 0;
1921                         break;
1922                 }
1923                 bytes_left = end - start;
1924         }
1925
1926         if (bytes_left) {
1927                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1928                                            GFP_NOFS, 0);
1929                 if (!ret)
1930                         *discarded_bytes += bytes_left;
1931         }
1932         return ret;
1933 }
1934
1935 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1936                          u64 num_bytes, u64 *actual_bytes)
1937 {
1938         int ret;
1939         u64 discarded_bytes = 0;
1940         struct btrfs_bio *bbio = NULL;
1941
1942
1943         /*
1944          * Avoid races with device replace and make sure our bbio has devices
1945          * associated to its stripes that don't go away while we are discarding.
1946          */
1947         btrfs_bio_counter_inc_blocked(fs_info);
1948         /* Tell the block device(s) that the sectors can be discarded */
1949         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1950                               &bbio, 0);
1951         /* Error condition is -ENOMEM */
1952         if (!ret) {
1953                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1954                 int i;
1955
1956
1957                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1958                         u64 bytes;
1959                         struct request_queue *req_q;
1960
1961                         if (!stripe->dev->bdev) {
1962                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1963                                 continue;
1964                         }
1965                         req_q = bdev_get_queue(stripe->dev->bdev);
1966                         if (!blk_queue_discard(req_q))
1967                                 continue;
1968
1969                         ret = btrfs_issue_discard(stripe->dev->bdev,
1970                                                   stripe->physical,
1971                                                   stripe->length,
1972                                                   &bytes);
1973                         if (!ret)
1974                                 discarded_bytes += bytes;
1975                         else if (ret != -EOPNOTSUPP)
1976                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1977
1978                         /*
1979                          * Just in case we get back EOPNOTSUPP for some reason,
1980                          * just ignore the return value so we don't screw up
1981                          * people calling discard_extent.
1982                          */
1983                         ret = 0;
1984                 }
1985                 btrfs_put_bbio(bbio);
1986         }
1987         btrfs_bio_counter_dec(fs_info);
1988
1989         if (actual_bytes)
1990                 *actual_bytes = discarded_bytes;
1991
1992
1993         if (ret == -EOPNOTSUPP)
1994                 ret = 0;
1995         return ret;
1996 }
1997
1998 /* Can return -ENOMEM */
1999 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2000                          struct btrfs_ref *generic_ref)
2001 {
2002         struct btrfs_fs_info *fs_info = trans->fs_info;
2003         int old_ref_mod, new_ref_mod;
2004         int ret;
2005
2006         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
2007                generic_ref->action);
2008         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
2009                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
2010
2011         if (generic_ref->type == BTRFS_REF_METADATA)
2012                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
2013                                 NULL, &old_ref_mod, &new_ref_mod);
2014         else
2015                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
2016                                                  &old_ref_mod, &new_ref_mod);
2017
2018         btrfs_ref_tree_mod(fs_info, generic_ref);
2019
2020         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2021                 sub_pinned_bytes(fs_info, generic_ref);
2022
2023         return ret;
2024 }
2025
2026 /*
2027  * __btrfs_inc_extent_ref - insert backreference for a given extent
2028  *
2029  * @trans:          Handle of transaction
2030  *
2031  * @node:           The delayed ref node used to get the bytenr/length for
2032  *                  extent whose references are incremented.
2033  *
2034  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2035  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2036  *                  bytenr of the parent block. Since new extents are always
2037  *                  created with indirect references, this will only be the case
2038  *                  when relocating a shared extent. In that case, root_objectid
2039  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2040  *                  be 0
2041  *
2042  * @root_objectid:  The id of the root where this modification has originated,
2043  *                  this can be either one of the well-known metadata trees or
2044  *                  the subvolume id which references this extent.
2045  *
2046  * @owner:          For data extents it is the inode number of the owning file.
2047  *                  For metadata extents this parameter holds the level in the
2048  *                  tree of the extent.
2049  *
2050  * @offset:         For metadata extents the offset is ignored and is currently
2051  *                  always passed as 0. For data extents it is the fileoffset
2052  *                  this extent belongs to.
2053  *
2054  * @refs_to_add     Number of references to add
2055  *
2056  * @extent_op       Pointer to a structure, holding information necessary when
2057  *                  updating a tree block's flags
2058  *
2059  */
2060 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2061                                   struct btrfs_delayed_ref_node *node,
2062                                   u64 parent, u64 root_objectid,
2063                                   u64 owner, u64 offset, int refs_to_add,
2064                                   struct btrfs_delayed_extent_op *extent_op)
2065 {
2066         struct btrfs_path *path;
2067         struct extent_buffer *leaf;
2068         struct btrfs_extent_item *item;
2069         struct btrfs_key key;
2070         u64 bytenr = node->bytenr;
2071         u64 num_bytes = node->num_bytes;
2072         u64 refs;
2073         int ret;
2074
2075         path = btrfs_alloc_path();
2076         if (!path)
2077                 return -ENOMEM;
2078
2079         path->reada = READA_FORWARD;
2080         path->leave_spinning = 1;
2081         /* this will setup the path even if it fails to insert the back ref */
2082         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2083                                            parent, root_objectid, owner,
2084                                            offset, refs_to_add, extent_op);
2085         if ((ret < 0 && ret != -EAGAIN) || !ret)
2086                 goto out;
2087
2088         /*
2089          * Ok we had -EAGAIN which means we didn't have space to insert and
2090          * inline extent ref, so just update the reference count and add a
2091          * normal backref.
2092          */
2093         leaf = path->nodes[0];
2094         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2095         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2096         refs = btrfs_extent_refs(leaf, item);
2097         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2098         if (extent_op)
2099                 __run_delayed_extent_op(extent_op, leaf, item);
2100
2101         btrfs_mark_buffer_dirty(leaf);
2102         btrfs_release_path(path);
2103
2104         path->reada = READA_FORWARD;
2105         path->leave_spinning = 1;
2106         /* now insert the actual backref */
2107         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2108                                     owner, offset, refs_to_add);
2109         if (ret)
2110                 btrfs_abort_transaction(trans, ret);
2111 out:
2112         btrfs_free_path(path);
2113         return ret;
2114 }
2115
2116 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2117                                 struct btrfs_delayed_ref_node *node,
2118                                 struct btrfs_delayed_extent_op *extent_op,
2119                                 int insert_reserved)
2120 {
2121         int ret = 0;
2122         struct btrfs_delayed_data_ref *ref;
2123         struct btrfs_key ins;
2124         u64 parent = 0;
2125         u64 ref_root = 0;
2126         u64 flags = 0;
2127
2128         ins.objectid = node->bytenr;
2129         ins.offset = node->num_bytes;
2130         ins.type = BTRFS_EXTENT_ITEM_KEY;
2131
2132         ref = btrfs_delayed_node_to_data_ref(node);
2133         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2134
2135         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2136                 parent = ref->parent;
2137         ref_root = ref->root;
2138
2139         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2140                 if (extent_op)
2141                         flags |= extent_op->flags_to_set;
2142                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2143                                                  flags, ref->objectid,
2144                                                  ref->offset, &ins,
2145                                                  node->ref_mod);
2146         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2147                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2148                                              ref->objectid, ref->offset,
2149                                              node->ref_mod, extent_op);
2150         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2151                 ret = __btrfs_free_extent(trans, node, parent,
2152                                           ref_root, ref->objectid,
2153                                           ref->offset, node->ref_mod,
2154                                           extent_op);
2155         } else {
2156                 BUG();
2157         }
2158         return ret;
2159 }
2160
2161 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2162                                     struct extent_buffer *leaf,
2163                                     struct btrfs_extent_item *ei)
2164 {
2165         u64 flags = btrfs_extent_flags(leaf, ei);
2166         if (extent_op->update_flags) {
2167                 flags |= extent_op->flags_to_set;
2168                 btrfs_set_extent_flags(leaf, ei, flags);
2169         }
2170
2171         if (extent_op->update_key) {
2172                 struct btrfs_tree_block_info *bi;
2173                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2174                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2175                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2176         }
2177 }
2178
2179 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2180                                  struct btrfs_delayed_ref_head *head,
2181                                  struct btrfs_delayed_extent_op *extent_op)
2182 {
2183         struct btrfs_fs_info *fs_info = trans->fs_info;
2184         struct btrfs_key key;
2185         struct btrfs_path *path;
2186         struct btrfs_extent_item *ei;
2187         struct extent_buffer *leaf;
2188         u32 item_size;
2189         int ret;
2190         int err = 0;
2191         int metadata = !extent_op->is_data;
2192
2193         if (trans->aborted)
2194                 return 0;
2195
2196         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2197                 metadata = 0;
2198
2199         path = btrfs_alloc_path();
2200         if (!path)
2201                 return -ENOMEM;
2202
2203         key.objectid = head->bytenr;
2204
2205         if (metadata) {
2206                 key.type = BTRFS_METADATA_ITEM_KEY;
2207                 key.offset = extent_op->level;
2208         } else {
2209                 key.type = BTRFS_EXTENT_ITEM_KEY;
2210                 key.offset = head->num_bytes;
2211         }
2212
2213 again:
2214         path->reada = READA_FORWARD;
2215         path->leave_spinning = 1;
2216         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2217         if (ret < 0) {
2218                 err = ret;
2219                 goto out;
2220         }
2221         if (ret > 0) {
2222                 if (metadata) {
2223                         if (path->slots[0] > 0) {
2224                                 path->slots[0]--;
2225                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2226                                                       path->slots[0]);
2227                                 if (key.objectid == head->bytenr &&
2228                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2229                                     key.offset == head->num_bytes)
2230                                         ret = 0;
2231                         }
2232                         if (ret > 0) {
2233                                 btrfs_release_path(path);
2234                                 metadata = 0;
2235
2236                                 key.objectid = head->bytenr;
2237                                 key.offset = head->num_bytes;
2238                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2239                                 goto again;
2240                         }
2241                 } else {
2242                         err = -EIO;
2243                         goto out;
2244                 }
2245         }
2246
2247         leaf = path->nodes[0];
2248         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2249
2250         if (unlikely(item_size < sizeof(*ei))) {
2251                 err = -EINVAL;
2252                 btrfs_print_v0_err(fs_info);
2253                 btrfs_abort_transaction(trans, err);
2254                 goto out;
2255         }
2256
2257         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2258         __run_delayed_extent_op(extent_op, leaf, ei);
2259
2260         btrfs_mark_buffer_dirty(leaf);
2261 out:
2262         btrfs_free_path(path);
2263         return err;
2264 }
2265
2266 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2267                                 struct btrfs_delayed_ref_node *node,
2268                                 struct btrfs_delayed_extent_op *extent_op,
2269                                 int insert_reserved)
2270 {
2271         int ret = 0;
2272         struct btrfs_delayed_tree_ref *ref;
2273         u64 parent = 0;
2274         u64 ref_root = 0;
2275
2276         ref = btrfs_delayed_node_to_tree_ref(node);
2277         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2278
2279         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2280                 parent = ref->parent;
2281         ref_root = ref->root;
2282
2283         if (node->ref_mod != 1) {
2284                 btrfs_err(trans->fs_info,
2285         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2286                           node->bytenr, node->ref_mod, node->action, ref_root,
2287                           parent);
2288                 return -EIO;
2289         }
2290         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2291                 BUG_ON(!extent_op || !extent_op->update_flags);
2292                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2293         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2294                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2295                                              ref->level, 0, 1, extent_op);
2296         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2297                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2298                                           ref->level, 0, 1, extent_op);
2299         } else {
2300                 BUG();
2301         }
2302         return ret;
2303 }
2304
2305 /* helper function to actually process a single delayed ref entry */
2306 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2307                                struct btrfs_delayed_ref_node *node,
2308                                struct btrfs_delayed_extent_op *extent_op,
2309                                int insert_reserved)
2310 {
2311         int ret = 0;
2312
2313         if (trans->aborted) {
2314                 if (insert_reserved)
2315                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2316                                          node->num_bytes, 1);
2317                 return 0;
2318         }
2319
2320         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2321             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2322                 ret = run_delayed_tree_ref(trans, node, extent_op,
2323                                            insert_reserved);
2324         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2325                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2326                 ret = run_delayed_data_ref(trans, node, extent_op,
2327                                            insert_reserved);
2328         else
2329                 BUG();
2330         if (ret && insert_reserved)
2331                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2332                                  node->num_bytes, 1);
2333         return ret;
2334 }
2335
2336 static inline struct btrfs_delayed_ref_node *
2337 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2338 {
2339         struct btrfs_delayed_ref_node *ref;
2340
2341         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2342                 return NULL;
2343
2344         /*
2345          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2346          * This is to prevent a ref count from going down to zero, which deletes
2347          * the extent item from the extent tree, when there still are references
2348          * to add, which would fail because they would not find the extent item.
2349          */
2350         if (!list_empty(&head->ref_add_list))
2351                 return list_first_entry(&head->ref_add_list,
2352                                 struct btrfs_delayed_ref_node, add_list);
2353
2354         ref = rb_entry(rb_first_cached(&head->ref_tree),
2355                        struct btrfs_delayed_ref_node, ref_node);
2356         ASSERT(list_empty(&ref->add_list));
2357         return ref;
2358 }
2359
2360 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2361                                       struct btrfs_delayed_ref_head *head)
2362 {
2363         spin_lock(&delayed_refs->lock);
2364         head->processing = 0;
2365         delayed_refs->num_heads_ready++;
2366         spin_unlock(&delayed_refs->lock);
2367         btrfs_delayed_ref_unlock(head);
2368 }
2369
2370 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2371                                 struct btrfs_delayed_ref_head *head)
2372 {
2373         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2374
2375         if (!extent_op)
2376                 return NULL;
2377
2378         if (head->must_insert_reserved) {
2379                 head->extent_op = NULL;
2380                 btrfs_free_delayed_extent_op(extent_op);
2381                 return NULL;
2382         }
2383         return extent_op;
2384 }
2385
2386 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2387                                      struct btrfs_delayed_ref_head *head)
2388 {
2389         struct btrfs_delayed_extent_op *extent_op;
2390         int ret;
2391
2392         extent_op = cleanup_extent_op(head);
2393         if (!extent_op)
2394                 return 0;
2395         head->extent_op = NULL;
2396         spin_unlock(&head->lock);
2397         ret = run_delayed_extent_op(trans, head, extent_op);
2398         btrfs_free_delayed_extent_op(extent_op);
2399         return ret ? ret : 1;
2400 }
2401
2402 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2403                                   struct btrfs_delayed_ref_root *delayed_refs,
2404                                   struct btrfs_delayed_ref_head *head)
2405 {
2406         int nr_items = 1;       /* Dropping this ref head update. */
2407
2408         if (head->total_ref_mod < 0) {
2409                 struct btrfs_space_info *space_info;
2410                 u64 flags;
2411
2412                 if (head->is_data)
2413                         flags = BTRFS_BLOCK_GROUP_DATA;
2414                 else if (head->is_system)
2415                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2416                 else
2417                         flags = BTRFS_BLOCK_GROUP_METADATA;
2418                 space_info = btrfs_find_space_info(fs_info, flags);
2419                 ASSERT(space_info);
2420                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2421                                    -head->num_bytes,
2422                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2423
2424                 /*
2425                  * We had csum deletions accounted for in our delayed refs rsv,
2426                  * we need to drop the csum leaves for this update from our
2427                  * delayed_refs_rsv.
2428                  */
2429                 if (head->is_data) {
2430                         spin_lock(&delayed_refs->lock);
2431                         delayed_refs->pending_csums -= head->num_bytes;
2432                         spin_unlock(&delayed_refs->lock);
2433                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2434                                 head->num_bytes);
2435                 }
2436         }
2437
2438         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2439 }
2440
2441 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2442                             struct btrfs_delayed_ref_head *head)
2443 {
2444
2445         struct btrfs_fs_info *fs_info = trans->fs_info;
2446         struct btrfs_delayed_ref_root *delayed_refs;
2447         int ret;
2448
2449         delayed_refs = &trans->transaction->delayed_refs;
2450
2451         ret = run_and_cleanup_extent_op(trans, head);
2452         if (ret < 0) {
2453                 unselect_delayed_ref_head(delayed_refs, head);
2454                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2455                 return ret;
2456         } else if (ret) {
2457                 return ret;
2458         }
2459
2460         /*
2461          * Need to drop our head ref lock and re-acquire the delayed ref lock
2462          * and then re-check to make sure nobody got added.
2463          */
2464         spin_unlock(&head->lock);
2465         spin_lock(&delayed_refs->lock);
2466         spin_lock(&head->lock);
2467         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2468                 spin_unlock(&head->lock);
2469                 spin_unlock(&delayed_refs->lock);
2470                 return 1;
2471         }
2472         btrfs_delete_ref_head(delayed_refs, head);
2473         spin_unlock(&head->lock);
2474         spin_unlock(&delayed_refs->lock);
2475
2476         if (head->must_insert_reserved) {
2477                 btrfs_pin_extent(fs_info, head->bytenr,
2478                                  head->num_bytes, 1);
2479                 if (head->is_data) {
2480                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2481                                               head->num_bytes);
2482                 }
2483         }
2484
2485         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2486
2487         trace_run_delayed_ref_head(fs_info, head, 0);
2488         btrfs_delayed_ref_unlock(head);
2489         btrfs_put_delayed_ref_head(head);
2490         return 0;
2491 }
2492
2493 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2494                                         struct btrfs_trans_handle *trans)
2495 {
2496         struct btrfs_delayed_ref_root *delayed_refs =
2497                 &trans->transaction->delayed_refs;
2498         struct btrfs_delayed_ref_head *head = NULL;
2499         int ret;
2500
2501         spin_lock(&delayed_refs->lock);
2502         head = btrfs_select_ref_head(delayed_refs);
2503         if (!head) {
2504                 spin_unlock(&delayed_refs->lock);
2505                 return head;
2506         }
2507
2508         /*
2509          * Grab the lock that says we are going to process all the refs for
2510          * this head
2511          */
2512         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2513         spin_unlock(&delayed_refs->lock);
2514
2515         /*
2516          * We may have dropped the spin lock to get the head mutex lock, and
2517          * that might have given someone else time to free the head.  If that's
2518          * true, it has been removed from our list and we can move on.
2519          */
2520         if (ret == -EAGAIN)
2521                 head = ERR_PTR(-EAGAIN);
2522
2523         return head;
2524 }
2525
2526 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2527                                     struct btrfs_delayed_ref_head *locked_ref,
2528                                     unsigned long *run_refs)
2529 {
2530         struct btrfs_fs_info *fs_info = trans->fs_info;
2531         struct btrfs_delayed_ref_root *delayed_refs;
2532         struct btrfs_delayed_extent_op *extent_op;
2533         struct btrfs_delayed_ref_node *ref;
2534         int must_insert_reserved = 0;
2535         int ret;
2536
2537         delayed_refs = &trans->transaction->delayed_refs;
2538
2539         lockdep_assert_held(&locked_ref->mutex);
2540         lockdep_assert_held(&locked_ref->lock);
2541
2542         while ((ref = select_delayed_ref(locked_ref))) {
2543                 if (ref->seq &&
2544                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2545                         spin_unlock(&locked_ref->lock);
2546                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2547                         return -EAGAIN;
2548                 }
2549
2550                 (*run_refs)++;
2551                 ref->in_tree = 0;
2552                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2553                 RB_CLEAR_NODE(&ref->ref_node);
2554                 if (!list_empty(&ref->add_list))
2555                         list_del(&ref->add_list);
2556                 /*
2557                  * When we play the delayed ref, also correct the ref_mod on
2558                  * head
2559                  */
2560                 switch (ref->action) {
2561                 case BTRFS_ADD_DELAYED_REF:
2562                 case BTRFS_ADD_DELAYED_EXTENT:
2563                         locked_ref->ref_mod -= ref->ref_mod;
2564                         break;
2565                 case BTRFS_DROP_DELAYED_REF:
2566                         locked_ref->ref_mod += ref->ref_mod;
2567                         break;
2568                 default:
2569                         WARN_ON(1);
2570                 }
2571                 atomic_dec(&delayed_refs->num_entries);
2572
2573                 /*
2574                  * Record the must_insert_reserved flag before we drop the
2575                  * spin lock.
2576                  */
2577                 must_insert_reserved = locked_ref->must_insert_reserved;
2578                 locked_ref->must_insert_reserved = 0;
2579
2580                 extent_op = locked_ref->extent_op;
2581                 locked_ref->extent_op = NULL;
2582                 spin_unlock(&locked_ref->lock);
2583
2584                 ret = run_one_delayed_ref(trans, ref, extent_op,
2585                                           must_insert_reserved);
2586
2587                 btrfs_free_delayed_extent_op(extent_op);
2588                 if (ret) {
2589                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2590                         btrfs_put_delayed_ref(ref);
2591                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2592                                     ret);
2593                         return ret;
2594                 }
2595
2596                 btrfs_put_delayed_ref(ref);
2597                 cond_resched();
2598
2599                 spin_lock(&locked_ref->lock);
2600                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2601         }
2602
2603         return 0;
2604 }
2605
2606 /*
2607  * Returns 0 on success or if called with an already aborted transaction.
2608  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2609  */
2610 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2611                                              unsigned long nr)
2612 {
2613         struct btrfs_fs_info *fs_info = trans->fs_info;
2614         struct btrfs_delayed_ref_root *delayed_refs;
2615         struct btrfs_delayed_ref_head *locked_ref = NULL;
2616         ktime_t start = ktime_get();
2617         int ret;
2618         unsigned long count = 0;
2619         unsigned long actual_count = 0;
2620
2621         delayed_refs = &trans->transaction->delayed_refs;
2622         do {
2623                 if (!locked_ref) {
2624                         locked_ref = btrfs_obtain_ref_head(trans);
2625                         if (IS_ERR_OR_NULL(locked_ref)) {
2626                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2627                                         continue;
2628                                 } else {
2629                                         break;
2630                                 }
2631                         }
2632                         count++;
2633                 }
2634                 /*
2635                  * We need to try and merge add/drops of the same ref since we
2636                  * can run into issues with relocate dropping the implicit ref
2637                  * and then it being added back again before the drop can
2638                  * finish.  If we merged anything we need to re-loop so we can
2639                  * get a good ref.
2640                  * Or we can get node references of the same type that weren't
2641                  * merged when created due to bumps in the tree mod seq, and
2642                  * we need to merge them to prevent adding an inline extent
2643                  * backref before dropping it (triggering a BUG_ON at
2644                  * insert_inline_extent_backref()).
2645                  */
2646                 spin_lock(&locked_ref->lock);
2647                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2648
2649                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2650                                                       &actual_count);
2651                 if (ret < 0 && ret != -EAGAIN) {
2652                         /*
2653                          * Error, btrfs_run_delayed_refs_for_head already
2654                          * unlocked everything so just bail out
2655                          */
2656                         return ret;
2657                 } else if (!ret) {
2658                         /*
2659                          * Success, perform the usual cleanup of a processed
2660                          * head
2661                          */
2662                         ret = cleanup_ref_head(trans, locked_ref);
2663                         if (ret > 0 ) {
2664                                 /* We dropped our lock, we need to loop. */
2665                                 ret = 0;
2666                                 continue;
2667                         } else if (ret) {
2668                                 return ret;
2669                         }
2670                 }
2671
2672                 /*
2673                  * Either success case or btrfs_run_delayed_refs_for_head
2674                  * returned -EAGAIN, meaning we need to select another head
2675                  */
2676
2677                 locked_ref = NULL;
2678                 cond_resched();
2679         } while ((nr != -1 && count < nr) || locked_ref);
2680
2681         /*
2682          * We don't want to include ref heads since we can have empty ref heads
2683          * and those will drastically skew our runtime down since we just do
2684          * accounting, no actual extent tree updates.
2685          */
2686         if (actual_count > 0) {
2687                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2688                 u64 avg;
2689
2690                 /*
2691                  * We weigh the current average higher than our current runtime
2692                  * to avoid large swings in the average.
2693                  */
2694                 spin_lock(&delayed_refs->lock);
2695                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2696                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2697                 spin_unlock(&delayed_refs->lock);
2698         }
2699         return 0;
2700 }
2701
2702 #ifdef SCRAMBLE_DELAYED_REFS
2703 /*
2704  * Normally delayed refs get processed in ascending bytenr order. This
2705  * correlates in most cases to the order added. To expose dependencies on this
2706  * order, we start to process the tree in the middle instead of the beginning
2707  */
2708 static u64 find_middle(struct rb_root *root)
2709 {
2710         struct rb_node *n = root->rb_node;
2711         struct btrfs_delayed_ref_node *entry;
2712         int alt = 1;
2713         u64 middle;
2714         u64 first = 0, last = 0;
2715
2716         n = rb_first(root);
2717         if (n) {
2718                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2719                 first = entry->bytenr;
2720         }
2721         n = rb_last(root);
2722         if (n) {
2723                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2724                 last = entry->bytenr;
2725         }
2726         n = root->rb_node;
2727
2728         while (n) {
2729                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2730                 WARN_ON(!entry->in_tree);
2731
2732                 middle = entry->bytenr;
2733
2734                 if (alt)
2735                         n = n->rb_left;
2736                 else
2737                         n = n->rb_right;
2738
2739                 alt = 1 - alt;
2740         }
2741         return middle;
2742 }
2743 #endif
2744
2745 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2746 {
2747         u64 num_bytes;
2748
2749         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2750                              sizeof(struct btrfs_extent_inline_ref));
2751         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2752                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2753
2754         /*
2755          * We don't ever fill up leaves all the way so multiply by 2 just to be
2756          * closer to what we're really going to want to use.
2757          */
2758         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2759 }
2760
2761 /*
2762  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2763  * would require to store the csums for that many bytes.
2764  */
2765 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2766 {
2767         u64 csum_size;
2768         u64 num_csums_per_leaf;
2769         u64 num_csums;
2770
2771         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2772         num_csums_per_leaf = div64_u64(csum_size,
2773                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2774         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2775         num_csums += num_csums_per_leaf - 1;
2776         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2777         return num_csums;
2778 }
2779
2780 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2781 {
2782         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2783         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2784         bool ret = false;
2785         u64 reserved;
2786
2787         spin_lock(&global_rsv->lock);
2788         reserved = global_rsv->reserved;
2789         spin_unlock(&global_rsv->lock);
2790
2791         /*
2792          * Since the global reserve is just kind of magic we don't really want
2793          * to rely on it to save our bacon, so if our size is more than the
2794          * delayed_refs_rsv and the global rsv then it's time to think about
2795          * bailing.
2796          */
2797         spin_lock(&delayed_refs_rsv->lock);
2798         reserved += delayed_refs_rsv->reserved;
2799         if (delayed_refs_rsv->size >= reserved)
2800                 ret = true;
2801         spin_unlock(&delayed_refs_rsv->lock);
2802         return ret;
2803 }
2804
2805 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2806 {
2807         u64 num_entries =
2808                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2809         u64 avg_runtime;
2810         u64 val;
2811
2812         smp_mb();
2813         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2814         val = num_entries * avg_runtime;
2815         if (val >= NSEC_PER_SEC)
2816                 return 1;
2817         if (val >= NSEC_PER_SEC / 2)
2818                 return 2;
2819
2820         return btrfs_check_space_for_delayed_refs(trans->fs_info);
2821 }
2822
2823 /*
2824  * this starts processing the delayed reference count updates and
2825  * extent insertions we have queued up so far.  count can be
2826  * 0, which means to process everything in the tree at the start
2827  * of the run (but not newly added entries), or it can be some target
2828  * number you'd like to process.
2829  *
2830  * Returns 0 on success or if called with an aborted transaction
2831  * Returns <0 on error and aborts the transaction
2832  */
2833 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2834                            unsigned long count)
2835 {
2836         struct btrfs_fs_info *fs_info = trans->fs_info;
2837         struct rb_node *node;
2838         struct btrfs_delayed_ref_root *delayed_refs;
2839         struct btrfs_delayed_ref_head *head;
2840         int ret;
2841         int run_all = count == (unsigned long)-1;
2842
2843         /* We'll clean this up in btrfs_cleanup_transaction */
2844         if (trans->aborted)
2845                 return 0;
2846
2847         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2848                 return 0;
2849
2850         delayed_refs = &trans->transaction->delayed_refs;
2851         if (count == 0)
2852                 count = atomic_read(&delayed_refs->num_entries) * 2;
2853
2854 again:
2855 #ifdef SCRAMBLE_DELAYED_REFS
2856         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2857 #endif
2858         ret = __btrfs_run_delayed_refs(trans, count);
2859         if (ret < 0) {
2860                 btrfs_abort_transaction(trans, ret);
2861                 return ret;
2862         }
2863
2864         if (run_all) {
2865                 btrfs_create_pending_block_groups(trans);
2866
2867                 spin_lock(&delayed_refs->lock);
2868                 node = rb_first_cached(&delayed_refs->href_root);
2869                 if (!node) {
2870                         spin_unlock(&delayed_refs->lock);
2871                         goto out;
2872                 }
2873                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2874                                 href_node);
2875                 refcount_inc(&head->refs);
2876                 spin_unlock(&delayed_refs->lock);
2877
2878                 /* Mutex was contended, block until it's released and retry. */
2879                 mutex_lock(&head->mutex);
2880                 mutex_unlock(&head->mutex);
2881
2882                 btrfs_put_delayed_ref_head(head);
2883                 cond_resched();
2884                 goto again;
2885         }
2886 out:
2887         return 0;
2888 }
2889
2890 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2891                                 u64 bytenr, u64 num_bytes, u64 flags,
2892                                 int level, int is_data)
2893 {
2894         struct btrfs_delayed_extent_op *extent_op;
2895         int ret;
2896
2897         extent_op = btrfs_alloc_delayed_extent_op();
2898         if (!extent_op)
2899                 return -ENOMEM;
2900
2901         extent_op->flags_to_set = flags;
2902         extent_op->update_flags = true;
2903         extent_op->update_key = false;
2904         extent_op->is_data = is_data ? true : false;
2905         extent_op->level = level;
2906
2907         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2908         if (ret)
2909                 btrfs_free_delayed_extent_op(extent_op);
2910         return ret;
2911 }
2912
2913 static noinline int check_delayed_ref(struct btrfs_root *root,
2914                                       struct btrfs_path *path,
2915                                       u64 objectid, u64 offset, u64 bytenr)
2916 {
2917         struct btrfs_delayed_ref_head *head;
2918         struct btrfs_delayed_ref_node *ref;
2919         struct btrfs_delayed_data_ref *data_ref;
2920         struct btrfs_delayed_ref_root *delayed_refs;
2921         struct btrfs_transaction *cur_trans;
2922         struct rb_node *node;
2923         int ret = 0;
2924
2925         spin_lock(&root->fs_info->trans_lock);
2926         cur_trans = root->fs_info->running_transaction;
2927         if (cur_trans)
2928                 refcount_inc(&cur_trans->use_count);
2929         spin_unlock(&root->fs_info->trans_lock);
2930         if (!cur_trans)
2931                 return 0;
2932
2933         delayed_refs = &cur_trans->delayed_refs;
2934         spin_lock(&delayed_refs->lock);
2935         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2936         if (!head) {
2937                 spin_unlock(&delayed_refs->lock);
2938                 btrfs_put_transaction(cur_trans);
2939                 return 0;
2940         }
2941
2942         if (!mutex_trylock(&head->mutex)) {
2943                 refcount_inc(&head->refs);
2944                 spin_unlock(&delayed_refs->lock);
2945
2946                 btrfs_release_path(path);
2947
2948                 /*
2949                  * Mutex was contended, block until it's released and let
2950                  * caller try again
2951                  */
2952                 mutex_lock(&head->mutex);
2953                 mutex_unlock(&head->mutex);
2954                 btrfs_put_delayed_ref_head(head);
2955                 btrfs_put_transaction(cur_trans);
2956                 return -EAGAIN;
2957         }
2958         spin_unlock(&delayed_refs->lock);
2959
2960         spin_lock(&head->lock);
2961         /*
2962          * XXX: We should replace this with a proper search function in the
2963          * future.
2964          */
2965         for (node = rb_first_cached(&head->ref_tree); node;
2966              node = rb_next(node)) {
2967                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2968                 /* If it's a shared ref we know a cross reference exists */
2969                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2970                         ret = 1;
2971                         break;
2972                 }
2973
2974                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2975
2976                 /*
2977                  * If our ref doesn't match the one we're currently looking at
2978                  * then we have a cross reference.
2979                  */
2980                 if (data_ref->root != root->root_key.objectid ||
2981                     data_ref->objectid != objectid ||
2982                     data_ref->offset != offset) {
2983                         ret = 1;
2984                         break;
2985                 }
2986         }
2987         spin_unlock(&head->lock);
2988         mutex_unlock(&head->mutex);
2989         btrfs_put_transaction(cur_trans);
2990         return ret;
2991 }
2992
2993 static noinline int check_committed_ref(struct btrfs_root *root,
2994                                         struct btrfs_path *path,
2995                                         u64 objectid, u64 offset, u64 bytenr)
2996 {
2997         struct btrfs_fs_info *fs_info = root->fs_info;
2998         struct btrfs_root *extent_root = fs_info->extent_root;
2999         struct extent_buffer *leaf;
3000         struct btrfs_extent_data_ref *ref;
3001         struct btrfs_extent_inline_ref *iref;
3002         struct btrfs_extent_item *ei;
3003         struct btrfs_key key;
3004         u32 item_size;
3005         int type;
3006         int ret;
3007
3008         key.objectid = bytenr;
3009         key.offset = (u64)-1;
3010         key.type = BTRFS_EXTENT_ITEM_KEY;
3011
3012         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3013         if (ret < 0)
3014                 goto out;
3015         BUG_ON(ret == 0); /* Corruption */
3016
3017         ret = -ENOENT;
3018         if (path->slots[0] == 0)
3019                 goto out;
3020
3021         path->slots[0]--;
3022         leaf = path->nodes[0];
3023         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3024
3025         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3026                 goto out;
3027
3028         ret = 1;
3029         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3030         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3031
3032         if (item_size != sizeof(*ei) +
3033             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3034                 goto out;
3035
3036         if (btrfs_extent_generation(leaf, ei) <=
3037             btrfs_root_last_snapshot(&root->root_item))
3038                 goto out;
3039
3040         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3041
3042         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3043         if (type != BTRFS_EXTENT_DATA_REF_KEY)
3044                 goto out;
3045
3046         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3047         if (btrfs_extent_refs(leaf, ei) !=
3048             btrfs_extent_data_ref_count(leaf, ref) ||
3049             btrfs_extent_data_ref_root(leaf, ref) !=
3050             root->root_key.objectid ||
3051             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3052             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3053                 goto out;
3054
3055         ret = 0;
3056 out:
3057         return ret;
3058 }
3059
3060 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3061                           u64 bytenr)
3062 {
3063         struct btrfs_path *path;
3064         int ret;
3065
3066         path = btrfs_alloc_path();
3067         if (!path)
3068                 return -ENOMEM;
3069
3070         do {
3071                 ret = check_committed_ref(root, path, objectid,
3072                                           offset, bytenr);
3073                 if (ret && ret != -ENOENT)
3074                         goto out;
3075
3076                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3077         } while (ret == -EAGAIN);
3078
3079 out:
3080         btrfs_free_path(path);
3081         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3082                 WARN_ON(ret > 0);
3083         return ret;
3084 }
3085
3086 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3087                            struct btrfs_root *root,
3088                            struct extent_buffer *buf,
3089                            int full_backref, int inc)
3090 {
3091         struct btrfs_fs_info *fs_info = root->fs_info;
3092         u64 bytenr;
3093         u64 num_bytes;
3094         u64 parent;
3095         u64 ref_root;
3096         u32 nritems;
3097         struct btrfs_key key;
3098         struct btrfs_file_extent_item *fi;
3099         struct btrfs_ref generic_ref = { 0 };
3100         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
3101         int i;
3102         int action;
3103         int level;
3104         int ret = 0;
3105
3106         if (btrfs_is_testing(fs_info))
3107                 return 0;
3108
3109         ref_root = btrfs_header_owner(buf);
3110         nritems = btrfs_header_nritems(buf);
3111         level = btrfs_header_level(buf);
3112
3113         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3114                 return 0;
3115
3116         if (full_backref)
3117                 parent = buf->start;
3118         else
3119                 parent = 0;
3120         if (inc)
3121                 action = BTRFS_ADD_DELAYED_REF;
3122         else
3123                 action = BTRFS_DROP_DELAYED_REF;
3124
3125         for (i = 0; i < nritems; i++) {
3126                 if (level == 0) {
3127                         btrfs_item_key_to_cpu(buf, &key, i);
3128                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3129                                 continue;
3130                         fi = btrfs_item_ptr(buf, i,
3131                                             struct btrfs_file_extent_item);
3132                         if (btrfs_file_extent_type(buf, fi) ==
3133                             BTRFS_FILE_EXTENT_INLINE)
3134                                 continue;
3135                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3136                         if (bytenr == 0)
3137                                 continue;
3138
3139                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3140                         key.offset -= btrfs_file_extent_offset(buf, fi);
3141                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3142                                                num_bytes, parent);
3143                         generic_ref.real_root = root->root_key.objectid;
3144                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3145                                             key.offset);
3146                         generic_ref.skip_qgroup = for_reloc;
3147                         if (inc)
3148                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3149                         else
3150                                 ret = btrfs_free_extent(trans, &generic_ref);
3151                         if (ret)
3152                                 goto fail;
3153                 } else {
3154                         bytenr = btrfs_node_blockptr(buf, i);
3155                         num_bytes = fs_info->nodesize;
3156                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
3157                                                num_bytes, parent);
3158                         generic_ref.real_root = root->root_key.objectid;
3159                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3160                         generic_ref.skip_qgroup = for_reloc;
3161                         if (inc)
3162                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
3163                         else
3164                                 ret = btrfs_free_extent(trans, &generic_ref);
3165                         if (ret)
3166                                 goto fail;
3167                 }
3168         }
3169         return 0;
3170 fail:
3171         return ret;
3172 }
3173
3174 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3175                   struct extent_buffer *buf, int full_backref)
3176 {
3177         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3178 }
3179
3180 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3181                   struct extent_buffer *buf, int full_backref)
3182 {
3183         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3184 }
3185
3186 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3187                                  struct btrfs_path *path,
3188                                  struct btrfs_block_group_cache *cache)
3189 {
3190         struct btrfs_fs_info *fs_info = trans->fs_info;
3191         int ret;
3192         struct btrfs_root *extent_root = fs_info->extent_root;
3193         unsigned long bi;
3194         struct extent_buffer *leaf;
3195
3196         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3197         if (ret) {
3198                 if (ret > 0)
3199                         ret = -ENOENT;
3200                 goto fail;
3201         }
3202
3203         leaf = path->nodes[0];
3204         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3205         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3206         btrfs_mark_buffer_dirty(leaf);
3207 fail:
3208         btrfs_release_path(path);
3209         return ret;
3210
3211 }
3212
3213 static struct btrfs_block_group_cache *next_block_group(
3214                 struct btrfs_block_group_cache *cache)
3215 {
3216         struct btrfs_fs_info *fs_info = cache->fs_info;
3217         struct rb_node *node;
3218
3219         spin_lock(&fs_info->block_group_cache_lock);
3220
3221         /* If our block group was removed, we need a full search. */
3222         if (RB_EMPTY_NODE(&cache->cache_node)) {
3223                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3224
3225                 spin_unlock(&fs_info->block_group_cache_lock);
3226                 btrfs_put_block_group(cache);
3227                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3228         }
3229         node = rb_next(&cache->cache_node);
3230         btrfs_put_block_group(cache);
3231         if (node) {
3232                 cache = rb_entry(node, struct btrfs_block_group_cache,
3233                                  cache_node);
3234                 btrfs_get_block_group(cache);
3235         } else
3236                 cache = NULL;
3237         spin_unlock(&fs_info->block_group_cache_lock);
3238         return cache;
3239 }
3240
3241 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3242                             struct btrfs_trans_handle *trans,
3243                             struct btrfs_path *path)
3244 {
3245         struct btrfs_fs_info *fs_info = block_group->fs_info;
3246         struct btrfs_root *root = fs_info->tree_root;
3247         struct inode *inode = NULL;
3248         struct extent_changeset *data_reserved = NULL;
3249         u64 alloc_hint = 0;
3250         int dcs = BTRFS_DC_ERROR;
3251         u64 num_pages = 0;
3252         int retries = 0;
3253         int ret = 0;
3254
3255         /*
3256          * If this block group is smaller than 100 megs don't bother caching the
3257          * block group.
3258          */
3259         if (block_group->key.offset < (100 * SZ_1M)) {
3260                 spin_lock(&block_group->lock);
3261                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3262                 spin_unlock(&block_group->lock);
3263                 return 0;
3264         }
3265
3266         if (trans->aborted)
3267                 return 0;
3268 again:
3269         inode = lookup_free_space_inode(block_group, path);
3270         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3271                 ret = PTR_ERR(inode);
3272                 btrfs_release_path(path);
3273                 goto out;
3274         }
3275
3276         if (IS_ERR(inode)) {
3277                 BUG_ON(retries);
3278                 retries++;
3279
3280                 if (block_group->ro)
3281                         goto out_free;
3282
3283                 ret = create_free_space_inode(trans, block_group, path);
3284                 if (ret)
3285                         goto out_free;
3286                 goto again;
3287         }
3288
3289         /*
3290          * We want to set the generation to 0, that way if anything goes wrong
3291          * from here on out we know not to trust this cache when we load up next
3292          * time.
3293          */
3294         BTRFS_I(inode)->generation = 0;
3295         ret = btrfs_update_inode(trans, root, inode);
3296         if (ret) {
3297                 /*
3298                  * So theoretically we could recover from this, simply set the
3299                  * super cache generation to 0 so we know to invalidate the
3300                  * cache, but then we'd have to keep track of the block groups
3301                  * that fail this way so we know we _have_ to reset this cache
3302                  * before the next commit or risk reading stale cache.  So to
3303                  * limit our exposure to horrible edge cases lets just abort the
3304                  * transaction, this only happens in really bad situations
3305                  * anyway.
3306                  */
3307                 btrfs_abort_transaction(trans, ret);
3308                 goto out_put;
3309         }
3310         WARN_ON(ret);
3311
3312         /* We've already setup this transaction, go ahead and exit */
3313         if (block_group->cache_generation == trans->transid &&
3314             i_size_read(inode)) {
3315                 dcs = BTRFS_DC_SETUP;
3316                 goto out_put;
3317         }
3318
3319         if (i_size_read(inode) > 0) {
3320                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3321                                         &fs_info->global_block_rsv);
3322                 if (ret)
3323                         goto out_put;
3324
3325                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3326                 if (ret)
3327                         goto out_put;
3328         }
3329
3330         spin_lock(&block_group->lock);
3331         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3332             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3333                 /*
3334                  * don't bother trying to write stuff out _if_
3335                  * a) we're not cached,
3336                  * b) we're with nospace_cache mount option,
3337                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3338                  */
3339                 dcs = BTRFS_DC_WRITTEN;
3340                 spin_unlock(&block_group->lock);
3341                 goto out_put;
3342         }
3343         spin_unlock(&block_group->lock);
3344
3345         /*
3346          * We hit an ENOSPC when setting up the cache in this transaction, just
3347          * skip doing the setup, we've already cleared the cache so we're safe.
3348          */
3349         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3350                 ret = -ENOSPC;
3351                 goto out_put;
3352         }
3353
3354         /*
3355          * Try to preallocate enough space based on how big the block group is.
3356          * Keep in mind this has to include any pinned space which could end up
3357          * taking up quite a bit since it's not folded into the other space
3358          * cache.
3359          */
3360         num_pages = div_u64(block_group->key.offset, SZ_256M);
3361         if (!num_pages)
3362                 num_pages = 1;
3363
3364         num_pages *= 16;
3365         num_pages *= PAGE_SIZE;
3366
3367         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3368         if (ret)
3369                 goto out_put;
3370
3371         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3372                                               num_pages, num_pages,
3373                                               &alloc_hint);
3374         /*
3375          * Our cache requires contiguous chunks so that we don't modify a bunch
3376          * of metadata or split extents when writing the cache out, which means
3377          * we can enospc if we are heavily fragmented in addition to just normal
3378          * out of space conditions.  So if we hit this just skip setting up any
3379          * other block groups for this transaction, maybe we'll unpin enough
3380          * space the next time around.
3381          */
3382         if (!ret)
3383                 dcs = BTRFS_DC_SETUP;
3384         else if (ret == -ENOSPC)
3385                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3386
3387 out_put:
3388         iput(inode);
3389 out_free:
3390         btrfs_release_path(path);
3391 out:
3392         spin_lock(&block_group->lock);
3393         if (!ret && dcs == BTRFS_DC_SETUP)
3394                 block_group->cache_generation = trans->transid;
3395         block_group->disk_cache_state = dcs;
3396         spin_unlock(&block_group->lock);
3397
3398         extent_changeset_free(data_reserved);
3399         return ret;
3400 }
3401
3402 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3403 {
3404         struct btrfs_fs_info *fs_info = trans->fs_info;
3405         struct btrfs_block_group_cache *cache, *tmp;
3406         struct btrfs_transaction *cur_trans = trans->transaction;
3407         struct btrfs_path *path;
3408
3409         if (list_empty(&cur_trans->dirty_bgs) ||
3410             !btrfs_test_opt(fs_info, SPACE_CACHE))
3411                 return 0;
3412
3413         path = btrfs_alloc_path();
3414         if (!path)
3415                 return -ENOMEM;
3416
3417         /* Could add new block groups, use _safe just in case */
3418         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3419                                  dirty_list) {
3420                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3421                         cache_save_setup(cache, trans, path);
3422         }
3423
3424         btrfs_free_path(path);
3425         return 0;
3426 }
3427
3428 /*
3429  * transaction commit does final block group cache writeback during a
3430  * critical section where nothing is allowed to change the FS.  This is
3431  * required in order for the cache to actually match the block group,
3432  * but can introduce a lot of latency into the commit.
3433  *
3434  * So, btrfs_start_dirty_block_groups is here to kick off block group
3435  * cache IO.  There's a chance we'll have to redo some of it if the
3436  * block group changes again during the commit, but it greatly reduces
3437  * the commit latency by getting rid of the easy block groups while
3438  * we're still allowing others to join the commit.
3439  */
3440 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3441 {
3442         struct btrfs_fs_info *fs_info = trans->fs_info;
3443         struct btrfs_block_group_cache *cache;
3444         struct btrfs_transaction *cur_trans = trans->transaction;
3445         int ret = 0;
3446         int should_put;
3447         struct btrfs_path *path = NULL;
3448         LIST_HEAD(dirty);
3449         struct list_head *io = &cur_trans->io_bgs;
3450         int num_started = 0;
3451         int loops = 0;
3452
3453         spin_lock(&cur_trans->dirty_bgs_lock);
3454         if (list_empty(&cur_trans->dirty_bgs)) {
3455                 spin_unlock(&cur_trans->dirty_bgs_lock);
3456                 return 0;
3457         }
3458         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3459         spin_unlock(&cur_trans->dirty_bgs_lock);
3460
3461 again:
3462         /*
3463          * make sure all the block groups on our dirty list actually
3464          * exist
3465          */
3466         btrfs_create_pending_block_groups(trans);
3467
3468         if (!path) {
3469                 path = btrfs_alloc_path();
3470                 if (!path)
3471                         return -ENOMEM;
3472         }
3473
3474         /*
3475          * cache_write_mutex is here only to save us from balance or automatic
3476          * removal of empty block groups deleting this block group while we are
3477          * writing out the cache
3478          */
3479         mutex_lock(&trans->transaction->cache_write_mutex);
3480         while (!list_empty(&dirty)) {
3481                 bool drop_reserve = true;
3482
3483                 cache = list_first_entry(&dirty,
3484                                          struct btrfs_block_group_cache,
3485                                          dirty_list);
3486                 /*
3487                  * this can happen if something re-dirties a block
3488                  * group that is already under IO.  Just wait for it to
3489                  * finish and then do it all again
3490                  */
3491                 if (!list_empty(&cache->io_list)) {
3492                         list_del_init(&cache->io_list);
3493                         btrfs_wait_cache_io(trans, cache, path);
3494                         btrfs_put_block_group(cache);
3495                 }
3496
3497
3498                 /*
3499                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3500                  * if it should update the cache_state.  Don't delete
3501                  * until after we wait.
3502                  *
3503                  * Since we're not running in the commit critical section
3504                  * we need the dirty_bgs_lock to protect from update_block_group
3505                  */
3506                 spin_lock(&cur_trans->dirty_bgs_lock);
3507                 list_del_init(&cache->dirty_list);
3508                 spin_unlock(&cur_trans->dirty_bgs_lock);
3509
3510                 should_put = 1;
3511
3512                 cache_save_setup(cache, trans, path);
3513
3514                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3515                         cache->io_ctl.inode = NULL;
3516                         ret = btrfs_write_out_cache(trans, cache, path);
3517                         if (ret == 0 && cache->io_ctl.inode) {
3518                                 num_started++;
3519                                 should_put = 0;
3520
3521                                 /*
3522                                  * The cache_write_mutex is protecting the
3523                                  * io_list, also refer to the definition of
3524                                  * btrfs_transaction::io_bgs for more details
3525                                  */
3526                                 list_add_tail(&cache->io_list, io);
3527                         } else {
3528                                 /*
3529                                  * if we failed to write the cache, the
3530                                  * generation will be bad and life goes on
3531                                  */
3532                                 ret = 0;
3533                         }
3534                 }
3535                 if (!ret) {
3536                         ret = write_one_cache_group(trans, path, cache);
3537                         /*
3538                          * Our block group might still be attached to the list
3539                          * of new block groups in the transaction handle of some
3540                          * other task (struct btrfs_trans_handle->new_bgs). This
3541                          * means its block group item isn't yet in the extent
3542                          * tree. If this happens ignore the error, as we will
3543                          * try again later in the critical section of the
3544                          * transaction commit.
3545                          */
3546                         if (ret == -ENOENT) {
3547                                 ret = 0;
3548                                 spin_lock(&cur_trans->dirty_bgs_lock);
3549                                 if (list_empty(&cache->dirty_list)) {
3550                                         list_add_tail(&cache->dirty_list,
3551                                                       &cur_trans->dirty_bgs);
3552                                         btrfs_get_block_group(cache);
3553                                         drop_reserve = false;
3554                                 }
3555                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3556                         } else if (ret) {
3557                                 btrfs_abort_transaction(trans, ret);
3558                         }
3559                 }
3560
3561                 /* if it's not on the io list, we need to put the block group */
3562                 if (should_put)
3563                         btrfs_put_block_group(cache);
3564                 if (drop_reserve)
3565                         btrfs_delayed_refs_rsv_release(fs_info, 1);
3566
3567                 if (ret)
3568                         break;
3569
3570                 /*
3571                  * Avoid blocking other tasks for too long. It might even save
3572                  * us from writing caches for block groups that are going to be
3573                  * removed.
3574                  */
3575                 mutex_unlock(&trans->transaction->cache_write_mutex);
3576                 mutex_lock(&trans->transaction->cache_write_mutex);
3577         }
3578         mutex_unlock(&trans->transaction->cache_write_mutex);
3579
3580         /*
3581          * go through delayed refs for all the stuff we've just kicked off
3582          * and then loop back (just once)
3583          */
3584         ret = btrfs_run_delayed_refs(trans, 0);
3585         if (!ret && loops == 0) {
3586                 loops++;
3587                 spin_lock(&cur_trans->dirty_bgs_lock);
3588                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3589                 /*
3590                  * dirty_bgs_lock protects us from concurrent block group
3591                  * deletes too (not just cache_write_mutex).
3592                  */
3593                 if (!list_empty(&dirty)) {
3594                         spin_unlock(&cur_trans->dirty_bgs_lock);
3595                         goto again;
3596                 }
3597                 spin_unlock(&cur_trans->dirty_bgs_lock);
3598         } else if (ret < 0) {
3599                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3600         }
3601
3602         btrfs_free_path(path);
3603         return ret;
3604 }
3605
3606 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3607 {
3608         struct btrfs_fs_info *fs_info = trans->fs_info;
3609         struct btrfs_block_group_cache *cache;
3610         struct btrfs_transaction *cur_trans = trans->transaction;
3611         int ret = 0;
3612         int should_put;
3613         struct btrfs_path *path;
3614         struct list_head *io = &cur_trans->io_bgs;
3615         int num_started = 0;
3616
3617         path = btrfs_alloc_path();
3618         if (!path)
3619                 return -ENOMEM;
3620
3621         /*
3622          * Even though we are in the critical section of the transaction commit,
3623          * we can still have concurrent tasks adding elements to this
3624          * transaction's list of dirty block groups. These tasks correspond to
3625          * endio free space workers started when writeback finishes for a
3626          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3627          * allocate new block groups as a result of COWing nodes of the root
3628          * tree when updating the free space inode. The writeback for the space
3629          * caches is triggered by an earlier call to
3630          * btrfs_start_dirty_block_groups() and iterations of the following
3631          * loop.
3632          * Also we want to do the cache_save_setup first and then run the
3633          * delayed refs to make sure we have the best chance at doing this all
3634          * in one shot.
3635          */
3636         spin_lock(&cur_trans->dirty_bgs_lock);
3637         while (!list_empty(&cur_trans->dirty_bgs)) {
3638                 cache = list_first_entry(&cur_trans->dirty_bgs,
3639                                          struct btrfs_block_group_cache,
3640                                          dirty_list);
3641
3642                 /*
3643                  * this can happen if cache_save_setup re-dirties a block
3644                  * group that is already under IO.  Just wait for it to
3645                  * finish and then do it all again
3646                  */
3647                 if (!list_empty(&cache->io_list)) {
3648                         spin_unlock(&cur_trans->dirty_bgs_lock);
3649                         list_del_init(&cache->io_list);
3650                         btrfs_wait_cache_io(trans, cache, path);
3651                         btrfs_put_block_group(cache);
3652                         spin_lock(&cur_trans->dirty_bgs_lock);
3653                 }
3654
3655                 /*
3656                  * don't remove from the dirty list until after we've waited
3657                  * on any pending IO
3658                  */
3659                 list_del_init(&cache->dirty_list);
3660                 spin_unlock(&cur_trans->dirty_bgs_lock);
3661                 should_put = 1;
3662
3663                 cache_save_setup(cache, trans, path);
3664
3665                 if (!ret)
3666                         ret = btrfs_run_delayed_refs(trans,
3667                                                      (unsigned long) -1);
3668
3669                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3670                         cache->io_ctl.inode = NULL;
3671                         ret = btrfs_write_out_cache(trans, cache, path);
3672                         if (ret == 0 && cache->io_ctl.inode) {
3673                                 num_started++;
3674                                 should_put = 0;
3675                                 list_add_tail(&cache->io_list, io);
3676                         } else {
3677                                 /*
3678                                  * if we failed to write the cache, the
3679                                  * generation will be bad and life goes on
3680                                  */
3681                                 ret = 0;
3682                         }
3683                 }
3684                 if (!ret) {
3685                         ret = write_one_cache_group(trans, path, cache);
3686                         /*
3687                          * One of the free space endio workers might have
3688                          * created a new block group while updating a free space
3689                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3690                          * and hasn't released its transaction handle yet, in
3691                          * which case the new block group is still attached to
3692                          * its transaction handle and its creation has not
3693                          * finished yet (no block group item in the extent tree
3694                          * yet, etc). If this is the case, wait for all free
3695                          * space endio workers to finish and retry. This is a
3696                          * a very rare case so no need for a more efficient and
3697                          * complex approach.
3698                          */
3699                         if (ret == -ENOENT) {
3700                                 wait_event(cur_trans->writer_wait,
3701                                    atomic_read(&cur_trans->num_writers) == 1);
3702                                 ret = write_one_cache_group(trans, path, cache);
3703                         }
3704                         if (ret)
3705                                 btrfs_abort_transaction(trans, ret);
3706                 }
3707
3708                 /* if its not on the io list, we need to put the block group */
3709                 if (should_put)
3710                         btrfs_put_block_group(cache);
3711                 btrfs_delayed_refs_rsv_release(fs_info, 1);
3712                 spin_lock(&cur_trans->dirty_bgs_lock);
3713         }
3714         spin_unlock(&cur_trans->dirty_bgs_lock);
3715
3716         /*
3717          * Refer to the definition of io_bgs member for details why it's safe
3718          * to use it without any locking
3719          */
3720         while (!list_empty(io)) {
3721                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3722                                          io_list);
3723                 list_del_init(&cache->io_list);
3724                 btrfs_wait_cache_io(trans, cache, path);
3725                 btrfs_put_block_group(cache);
3726         }
3727
3728         btrfs_free_path(path);
3729         return ret;
3730 }
3731
3732 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3733 {
3734         struct btrfs_block_group_cache *block_group;
3735         int readonly = 0;
3736
3737         block_group = btrfs_lookup_block_group(fs_info, bytenr);
3738         if (!block_group || block_group->ro)
3739                 readonly = 1;
3740         if (block_group)
3741                 btrfs_put_block_group(block_group);
3742         return readonly;
3743 }
3744
3745 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3746 {
3747         struct btrfs_block_group_cache *bg;
3748         bool ret = true;
3749
3750         bg = btrfs_lookup_block_group(fs_info, bytenr);
3751         if (!bg)
3752                 return false;
3753
3754         spin_lock(&bg->lock);
3755         if (bg->ro)
3756                 ret = false;
3757         else
3758                 atomic_inc(&bg->nocow_writers);
3759         spin_unlock(&bg->lock);
3760
3761         /* no put on block group, done by btrfs_dec_nocow_writers */
3762         if (!ret)
3763                 btrfs_put_block_group(bg);
3764
3765         return ret;
3766
3767 }
3768
3769 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3770 {
3771         struct btrfs_block_group_cache *bg;
3772
3773         bg = btrfs_lookup_block_group(fs_info, bytenr);
3774         ASSERT(bg);
3775         if (atomic_dec_and_test(&bg->nocow_writers))
3776                 wake_up_var(&bg->nocow_writers);
3777         /*
3778          * Once for our lookup and once for the lookup done by a previous call
3779          * to btrfs_inc_nocow_writers()
3780          */
3781         btrfs_put_block_group(bg);
3782         btrfs_put_block_group(bg);
3783 }
3784
3785 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3786 {
3787         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3788 }
3789
3790 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3791 {
3792         u64 extra_flags = chunk_to_extended(flags) &
3793                                 BTRFS_EXTENDED_PROFILE_MASK;
3794
3795         write_seqlock(&fs_info->profiles_lock);
3796         if (flags & BTRFS_BLOCK_GROUP_DATA)
3797                 fs_info->avail_data_alloc_bits |= extra_flags;
3798         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3799                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3800         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3801                 fs_info->avail_system_alloc_bits |= extra_flags;
3802         write_sequnlock(&fs_info->profiles_lock);
3803 }
3804
3805 /*
3806  * returns target flags in extended format or 0 if restripe for this
3807  * chunk_type is not in progress
3808  *
3809  * should be called with balance_lock held
3810  */
3811 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3812 {
3813         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3814         u64 target = 0;
3815
3816         if (!bctl)
3817                 return 0;
3818
3819         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3820             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3821                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3822         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3823                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3824                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3825         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3826                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3827                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3828         }
3829
3830         return target;
3831 }
3832
3833 /*
3834  * @flags: available profiles in extended format (see ctree.h)
3835  *
3836  * Returns reduced profile in chunk format.  If profile changing is in
3837  * progress (either running or paused) picks the target profile (if it's
3838  * already available), otherwise falls back to plain reducing.
3839  */
3840 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
3841 {
3842         u64 num_devices = fs_info->fs_devices->rw_devices;
3843         u64 target;
3844         u64 raid_type;
3845         u64 allowed = 0;
3846
3847         /*
3848          * see if restripe for this chunk_type is in progress, if so
3849          * try to reduce to the target profile
3850          */
3851         spin_lock(&fs_info->balance_lock);
3852         target = get_restripe_target(fs_info, flags);
3853         if (target) {
3854                 /* pick target profile only if it's already available */
3855                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3856                         spin_unlock(&fs_info->balance_lock);
3857                         return extended_to_chunk(target);
3858                 }
3859         }
3860         spin_unlock(&fs_info->balance_lock);
3861
3862         /* First, mask out the RAID levels which aren't possible */
3863         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3864                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3865                         allowed |= btrfs_raid_array[raid_type].bg_flag;
3866         }
3867         allowed &= flags;
3868
3869         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3870                 allowed = BTRFS_BLOCK_GROUP_RAID6;
3871         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3872                 allowed = BTRFS_BLOCK_GROUP_RAID5;
3873         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3874                 allowed = BTRFS_BLOCK_GROUP_RAID10;
3875         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3876                 allowed = BTRFS_BLOCK_GROUP_RAID1;
3877         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3878                 allowed = BTRFS_BLOCK_GROUP_RAID0;
3879
3880         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3881
3882         return extended_to_chunk(flags | allowed);
3883 }
3884
3885 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
3886 {
3887         unsigned seq;
3888         u64 flags;
3889
3890         do {
3891                 flags = orig_flags;
3892                 seq = read_seqbegin(&fs_info->profiles_lock);
3893
3894                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3895                         flags |= fs_info->avail_data_alloc_bits;
3896                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3897                         flags |= fs_info->avail_system_alloc_bits;
3898                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3899                         flags |= fs_info->avail_metadata_alloc_bits;
3900         } while (read_seqretry(&fs_info->profiles_lock, seq));
3901
3902         return btrfs_reduce_alloc_profile(fs_info, flags);
3903 }
3904
3905 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
3906 {
3907         struct btrfs_fs_info *fs_info = root->fs_info;
3908         u64 flags;
3909         u64 ret;
3910
3911         if (data)
3912                 flags = BTRFS_BLOCK_GROUP_DATA;
3913         else if (root == fs_info->chunk_root)
3914                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3915         else
3916                 flags = BTRFS_BLOCK_GROUP_METADATA;
3917
3918         ret = get_alloc_profile(fs_info, flags);
3919         return ret;
3920 }
3921
3922 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3923 {
3924         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3925 }
3926
3927 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3928 {
3929         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3930 }
3931
3932 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3933 {
3934         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3935 }
3936
3937 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
3938 {
3939         struct btrfs_root *root = inode->root;
3940         struct btrfs_fs_info *fs_info = root->fs_info;
3941         struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
3942         u64 used;
3943         int ret = 0;
3944         int need_commit = 2;
3945         int have_pinned_space;
3946
3947         /* make sure bytes are sectorsize aligned */
3948         bytes = ALIGN(bytes, fs_info->sectorsize);
3949
3950         if (btrfs_is_free_space_inode(inode)) {
3951                 need_commit = 0;
3952                 ASSERT(current->journal_info);
3953         }
3954
3955 again:
3956         /* make sure we have enough space to handle the data first */
3957         spin_lock(&data_sinfo->lock);
3958         used = btrfs_space_info_used(data_sinfo, true);
3959
3960         if (used + bytes > data_sinfo->total_bytes) {
3961                 struct btrfs_trans_handle *trans;
3962
3963                 /*
3964                  * if we don't have enough free bytes in this space then we need
3965                  * to alloc a new chunk.
3966                  */
3967                 if (!data_sinfo->full) {
3968                         u64 alloc_target;
3969
3970                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3971                         spin_unlock(&data_sinfo->lock);
3972
3973                         alloc_target = btrfs_data_alloc_profile(fs_info);
3974                         /*
3975                          * It is ugly that we don't call nolock join
3976                          * transaction for the free space inode case here.
3977                          * But it is safe because we only do the data space
3978                          * reservation for the free space cache in the
3979                          * transaction context, the common join transaction
3980                          * just increase the counter of the current transaction
3981                          * handler, doesn't try to acquire the trans_lock of
3982                          * the fs.
3983                          */
3984                         trans = btrfs_join_transaction(root);
3985                         if (IS_ERR(trans))
3986                                 return PTR_ERR(trans);
3987
3988                         ret = btrfs_chunk_alloc(trans, alloc_target,
3989                                                 CHUNK_ALLOC_NO_FORCE);
3990                         btrfs_end_transaction(trans);
3991                         if (ret < 0) {
3992                                 if (ret != -ENOSPC)
3993                                         return ret;
3994                                 else {
3995                                         have_pinned_space = 1;
3996                                         goto commit_trans;
3997                                 }
3998                         }
3999
4000                         goto again;
4001                 }
4002
4003                 /*
4004                  * If we don't have enough pinned space to deal with this
4005                  * allocation, and no removed chunk in current transaction,
4006                  * don't bother committing the transaction.
4007                  */
4008                 have_pinned_space = __percpu_counter_compare(
4009                         &data_sinfo->total_bytes_pinned,
4010                         used + bytes - data_sinfo->total_bytes,
4011                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
4012                 spin_unlock(&data_sinfo->lock);
4013
4014                 /* commit the current transaction and try again */
4015 commit_trans:
4016                 if (need_commit) {
4017                         need_commit--;
4018
4019                         if (need_commit > 0) {
4020                                 btrfs_start_delalloc_roots(fs_info, -1);
4021                                 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4022                                                          (u64)-1);
4023                         }
4024
4025                         trans = btrfs_join_transaction(root);
4026                         if (IS_ERR(trans))
4027                                 return PTR_ERR(trans);
4028                         if (have_pinned_space >= 0 ||
4029                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4030                                      &trans->transaction->flags) ||
4031                             need_commit > 0) {
4032                                 ret = btrfs_commit_transaction(trans);
4033                                 if (ret)
4034                                         return ret;
4035                                 /*
4036                                  * The cleaner kthread might still be doing iput
4037                                  * operations. Wait for it to finish so that
4038                                  * more space is released.  We don't need to
4039                                  * explicitly run the delayed iputs here because
4040                                  * the commit_transaction would have woken up
4041                                  * the cleaner.
4042                                  */
4043                                 ret = btrfs_wait_on_delayed_iputs(fs_info);
4044                                 if (ret)
4045                                         return ret;
4046                                 goto again;
4047                         } else {
4048                                 btrfs_end_transaction(trans);
4049                         }
4050                 }
4051
4052                 trace_btrfs_space_reservation(fs_info,
4053                                               "space_info:enospc",
4054                                               data_sinfo->flags, bytes, 1);
4055                 return -ENOSPC;
4056         }
4057         update_bytes_may_use(fs_info, data_sinfo, bytes);
4058         trace_btrfs_space_reservation(fs_info, "space_info",
4059                                       data_sinfo->flags, bytes, 1);
4060         spin_unlock(&data_sinfo->lock);
4061
4062         return 0;
4063 }
4064
4065 int btrfs_check_data_free_space(struct inode *inode,
4066                         struct extent_changeset **reserved, u64 start, u64 len)
4067 {
4068         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4069         int ret;
4070
4071         /* align the range */
4072         len = round_up(start + len, fs_info->sectorsize) -
4073               round_down(start, fs_info->sectorsize);
4074         start = round_down(start, fs_info->sectorsize);
4075
4076         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4077         if (ret < 0)
4078                 return ret;
4079
4080         /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4081         ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4082         if (ret < 0)
4083                 btrfs_free_reserved_data_space_noquota(inode, start, len);
4084         else
4085                 ret = 0;
4086         return ret;
4087 }
4088
4089 /*
4090  * Called if we need to clear a data reservation for this inode
4091  * Normally in a error case.
4092  *
4093  * This one will *NOT* use accurate qgroup reserved space API, just for case
4094  * which we can't sleep and is sure it won't affect qgroup reserved space.
4095  * Like clear_bit_hook().
4096  */
4097 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4098                                             u64 len)
4099 {
4100         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4101         struct btrfs_space_info *data_sinfo;
4102
4103         /* Make sure the range is aligned to sectorsize */
4104         len = round_up(start + len, fs_info->sectorsize) -
4105               round_down(start, fs_info->sectorsize);
4106         start = round_down(start, fs_info->sectorsize);
4107
4108         data_sinfo = fs_info->data_sinfo;
4109         spin_lock(&data_sinfo->lock);
4110         update_bytes_may_use(fs_info, data_sinfo, -len);
4111         trace_btrfs_space_reservation(fs_info, "space_info",
4112                                       data_sinfo->flags, len, 0);
4113         spin_unlock(&data_sinfo->lock);
4114 }
4115
4116 /*
4117  * Called if we need to clear a data reservation for this inode
4118  * Normally in a error case.
4119  *
4120  * This one will handle the per-inode data rsv map for accurate reserved
4121  * space framework.
4122  */
4123 void btrfs_free_reserved_data_space(struct inode *inode,
4124                         struct extent_changeset *reserved, u64 start, u64 len)
4125 {
4126         struct btrfs_root *root = BTRFS_I(inode)->root;
4127
4128         /* Make sure the range is aligned to sectorsize */
4129         len = round_up(start + len, root->fs_info->sectorsize) -
4130               round_down(start, root->fs_info->sectorsize);
4131         start = round_down(start, root->fs_info->sectorsize);
4132
4133         btrfs_free_reserved_data_space_noquota(inode, start, len);
4134         btrfs_qgroup_free_data(inode, reserved, start, len);
4135 }
4136
4137 static void force_metadata_allocation(struct btrfs_fs_info *info)
4138 {
4139         struct list_head *head = &info->space_info;
4140         struct btrfs_space_info *found;
4141
4142         rcu_read_lock();
4143         list_for_each_entry_rcu(found, head, list) {
4144                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4145                         found->force_alloc = CHUNK_ALLOC_FORCE;
4146         }
4147         rcu_read_unlock();
4148 }
4149
4150 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4151 {
4152         return (global->size << 1);
4153 }
4154
4155 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4156                               struct btrfs_space_info *sinfo, int force)
4157 {
4158         u64 bytes_used = btrfs_space_info_used(sinfo, false);
4159         u64 thresh;
4160
4161         if (force == CHUNK_ALLOC_FORCE)
4162                 return 1;
4163
4164         /*
4165          * in limited mode, we want to have some free space up to
4166          * about 1% of the FS size.
4167          */
4168         if (force == CHUNK_ALLOC_LIMITED) {
4169                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4170                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4171
4172                 if (sinfo->total_bytes - bytes_used < thresh)
4173                         return 1;
4174         }
4175
4176         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4177                 return 0;
4178         return 1;
4179 }
4180
4181 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4182 {
4183         u64 num_dev;
4184
4185         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4186         if (!num_dev)
4187                 num_dev = fs_info->fs_devices->rw_devices;
4188
4189         return num_dev;
4190 }
4191
4192 /*
4193  * If @is_allocation is true, reserve space in the system space info necessary
4194  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4195  * removing a chunk.
4196  */
4197 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4198 {
4199         struct btrfs_fs_info *fs_info = trans->fs_info;
4200         struct btrfs_space_info *info;
4201         u64 left;
4202         u64 thresh;
4203         int ret = 0;
4204         u64 num_devs;
4205
4206         /*
4207          * Needed because we can end up allocating a system chunk and for an
4208          * atomic and race free space reservation in the chunk block reserve.
4209          */
4210         lockdep_assert_held(&fs_info->chunk_mutex);
4211
4212         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4213         spin_lock(&info->lock);
4214         left = info->total_bytes - btrfs_space_info_used(info, true);
4215         spin_unlock(&info->lock);
4216
4217         num_devs = get_profile_num_devs(fs_info, type);
4218
4219         /* num_devs device items to update and 1 chunk item to add or remove */
4220         thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4221                 btrfs_calc_trans_metadata_size(fs_info, 1);
4222
4223         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4224                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4225                            left, thresh, type);
4226                 dump_space_info(fs_info, info, 0, 0);
4227         }
4228
4229         if (left < thresh) {
4230                 u64 flags = btrfs_system_alloc_profile(fs_info);
4231
4232                 /*
4233                  * Ignore failure to create system chunk. We might end up not
4234                  * needing it, as we might not need to COW all nodes/leafs from
4235                  * the paths we visit in the chunk tree (they were already COWed
4236                  * or created in the current transaction for example).
4237                  */
4238                 ret = btrfs_alloc_chunk(trans, flags);
4239         }
4240
4241         if (!ret) {
4242                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4243                                           &fs_info->chunk_block_rsv,
4244                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4245                 if (!ret)
4246                         trans->chunk_bytes_reserved += thresh;
4247         }
4248 }
4249
4250 /*
4251  * If force is CHUNK_ALLOC_FORCE:
4252  *    - return 1 if it successfully allocates a chunk,
4253  *    - return errors including -ENOSPC otherwise.
4254  * If force is NOT CHUNK_ALLOC_FORCE:
4255  *    - return 0 if it doesn't need to allocate a new chunk,
4256  *    - return 1 if it successfully allocates a chunk,
4257  *    - return errors including -ENOSPC otherwise.
4258  */
4259 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4260                       enum btrfs_chunk_alloc_enum force)
4261 {
4262         struct btrfs_fs_info *fs_info = trans->fs_info;
4263         struct btrfs_space_info *space_info;
4264         bool wait_for_alloc = false;
4265         bool should_alloc = false;
4266         int ret = 0;
4267
4268         /* Don't re-enter if we're already allocating a chunk */
4269         if (trans->allocating_chunk)
4270                 return -ENOSPC;
4271
4272         space_info = btrfs_find_space_info(fs_info, flags);
4273         ASSERT(space_info);
4274
4275         do {
4276                 spin_lock(&space_info->lock);
4277                 if (force < space_info->force_alloc)
4278                         force = space_info->force_alloc;
4279                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4280                 if (space_info->full) {
4281                         /* No more free physical space */
4282                         if (should_alloc)
4283                                 ret = -ENOSPC;
4284                         else
4285                                 ret = 0;
4286                         spin_unlock(&space_info->lock);
4287                         return ret;
4288                 } else if (!should_alloc) {
4289                         spin_unlock(&space_info->lock);
4290                         return 0;
4291                 } else if (space_info->chunk_alloc) {
4292                         /*
4293                          * Someone is already allocating, so we need to block
4294                          * until this someone is finished and then loop to
4295                          * recheck if we should continue with our allocation
4296                          * attempt.
4297                          */
4298                         wait_for_alloc = true;
4299                         spin_unlock(&space_info->lock);
4300                         mutex_lock(&fs_info->chunk_mutex);
4301                         mutex_unlock(&fs_info->chunk_mutex);
4302                 } else {
4303                         /* Proceed with allocation */
4304                         space_info->chunk_alloc = 1;
4305                         wait_for_alloc = false;
4306                         spin_unlock(&space_info->lock);
4307                 }
4308
4309                 cond_resched();
4310         } while (wait_for_alloc);
4311
4312         mutex_lock(&fs_info->chunk_mutex);
4313         trans->allocating_chunk = true;
4314
4315         /*
4316          * If we have mixed data/metadata chunks we want to make sure we keep
4317          * allocating mixed chunks instead of individual chunks.
4318          */
4319         if (btrfs_mixed_space_info(space_info))
4320                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4321
4322         /*
4323          * if we're doing a data chunk, go ahead and make sure that
4324          * we keep a reasonable number of metadata chunks allocated in the
4325          * FS as well.
4326          */
4327         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4328                 fs_info->data_chunk_allocations++;
4329                 if (!(fs_info->data_chunk_allocations %
4330                       fs_info->metadata_ratio))
4331                         force_metadata_allocation(fs_info);
4332         }
4333
4334         /*
4335          * Check if we have enough space in SYSTEM chunk because we may need
4336          * to update devices.
4337          */
4338         check_system_chunk(trans, flags);
4339
4340         ret = btrfs_alloc_chunk(trans, flags);
4341         trans->allocating_chunk = false;
4342
4343         spin_lock(&space_info->lock);
4344         if (ret < 0) {
4345                 if (ret == -ENOSPC)
4346                         space_info->full = 1;
4347                 else
4348                         goto out;
4349         } else {
4350                 ret = 1;
4351                 space_info->max_extent_size = 0;
4352         }
4353
4354         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4355 out:
4356         space_info->chunk_alloc = 0;
4357         spin_unlock(&space_info->lock);
4358         mutex_unlock(&fs_info->chunk_mutex);
4359         /*
4360          * When we allocate a new chunk we reserve space in the chunk block
4361          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4362          * add new nodes/leafs to it if we end up needing to do it when
4363          * inserting the chunk item and updating device items as part of the
4364          * second phase of chunk allocation, performed by
4365          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4366          * large number of new block groups to create in our transaction
4367          * handle's new_bgs list to avoid exhausting the chunk block reserve
4368          * in extreme cases - like having a single transaction create many new
4369          * block groups when starting to write out the free space caches of all
4370          * the block groups that were made dirty during the lifetime of the
4371          * transaction.
4372          */
4373         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4374                 btrfs_create_pending_block_groups(trans);
4375
4376         return ret;
4377 }
4378
4379 static int can_overcommit(struct btrfs_fs_info *fs_info,
4380                           struct btrfs_space_info *space_info, u64 bytes,
4381                           enum btrfs_reserve_flush_enum flush,
4382                           bool system_chunk)
4383 {
4384         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4385         u64 profile;
4386         u64 space_size;
4387         u64 avail;
4388         u64 used;
4389         int factor;
4390
4391         /* Don't overcommit when in mixed mode. */
4392         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4393                 return 0;
4394
4395         if (system_chunk)
4396                 profile = btrfs_system_alloc_profile(fs_info);
4397         else
4398                 profile = btrfs_metadata_alloc_profile(fs_info);
4399
4400         used = btrfs_space_info_used(space_info, false);
4401
4402         /*
4403          * We only want to allow over committing if we have lots of actual space
4404          * free, but if we don't have enough space to handle the global reserve
4405          * space then we could end up having a real enospc problem when trying
4406          * to allocate a chunk or some other such important allocation.
4407          */
4408         spin_lock(&global_rsv->lock);
4409         space_size = calc_global_rsv_need_space(global_rsv);
4410         spin_unlock(&global_rsv->lock);
4411         if (used + space_size >= space_info->total_bytes)
4412                 return 0;
4413
4414         used += space_info->bytes_may_use;
4415
4416         avail = atomic64_read(&fs_info->free_chunk_space);
4417
4418         /*
4419          * If we have dup, raid1 or raid10 then only half of the free
4420          * space is actually usable.  For raid56, the space info used
4421          * doesn't include the parity drive, so we don't have to
4422          * change the math
4423          */
4424         factor = btrfs_bg_type_to_factor(profile);
4425         avail = div_u64(avail, factor);
4426
4427         /*
4428          * If we aren't flushing all things, let us overcommit up to
4429          * 1/2th of the space. If we can flush, don't let us overcommit
4430          * too much, let it overcommit up to 1/8 of the space.
4431          */
4432         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4433                 avail >>= 3;
4434         else
4435                 avail >>= 1;
4436
4437         if (used + bytes < space_info->total_bytes + avail)
4438                 return 1;
4439         return 0;
4440 }
4441
4442 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4443                                          unsigned long nr_pages, int nr_items)
4444 {
4445         struct super_block *sb = fs_info->sb;
4446
4447         if (down_read_trylock(&sb->s_umount)) {
4448                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4449                 up_read(&sb->s_umount);
4450         } else {
4451                 /*
4452                  * We needn't worry the filesystem going from r/w to r/o though
4453                  * we don't acquire ->s_umount mutex, because the filesystem
4454                  * should guarantee the delalloc inodes list be empty after
4455                  * the filesystem is readonly(all dirty pages are written to
4456                  * the disk).
4457                  */
4458                 btrfs_start_delalloc_roots(fs_info, nr_items);
4459                 if (!current->journal_info)
4460                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4461         }
4462 }
4463
4464 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4465                                         u64 to_reclaim)
4466 {
4467         u64 bytes;
4468         u64 nr;
4469
4470         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4471         nr = div64_u64(to_reclaim, bytes);
4472         if (!nr)
4473                 nr = 1;
4474         return nr;
4475 }
4476
4477 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4478
4479 /*
4480  * shrink metadata reservation for delalloc
4481  */
4482 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4483                             u64 orig, bool wait_ordered)
4484 {
4485         struct btrfs_space_info *space_info;
4486         struct btrfs_trans_handle *trans;
4487         u64 delalloc_bytes;
4488         u64 dio_bytes;
4489         u64 async_pages;
4490         u64 items;
4491         long time_left;
4492         unsigned long nr_pages;
4493         int loops;
4494
4495         /* Calc the number of the pages we need flush for space reservation */
4496         items = calc_reclaim_items_nr(fs_info, to_reclaim);
4497         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4498
4499         trans = (struct btrfs_trans_handle *)current->journal_info;
4500         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4501
4502         delalloc_bytes = percpu_counter_sum_positive(
4503                                                 &fs_info->delalloc_bytes);
4504         dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4505         if (delalloc_bytes == 0 && dio_bytes == 0) {
4506                 if (trans)
4507                         return;
4508                 if (wait_ordered)
4509                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4510                 return;
4511         }
4512
4513         /*
4514          * If we are doing more ordered than delalloc we need to just wait on
4515          * ordered extents, otherwise we'll waste time trying to flush delalloc
4516          * that likely won't give us the space back we need.
4517          */
4518         if (dio_bytes > delalloc_bytes)
4519                 wait_ordered = true;
4520
4521         loops = 0;
4522         while ((delalloc_bytes || dio_bytes) && loops < 3) {
4523                 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4524
4525                 /*
4526                  * Triggers inode writeback for up to nr_pages. This will invoke
4527                  * ->writepages callback and trigger delalloc filling
4528                  *  (btrfs_run_delalloc_range()).
4529                  */
4530                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4531
4532                 /*
4533                  * We need to wait for the compressed pages to start before
4534                  * we continue.
4535                  */
4536                 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4537                 if (!async_pages)
4538                         goto skip_async;
4539
4540                 /*
4541                  * Calculate how many compressed pages we want to be written
4542                  * before we continue. I.e if there are more async pages than we
4543                  * require wait_event will wait until nr_pages are written.
4544                  */
4545                 if (async_pages <= nr_pages)
4546                         async_pages = 0;
4547                 else
4548                         async_pages -= nr_pages;
4549
4550                 wait_event(fs_info->async_submit_wait,
4551                            atomic_read(&fs_info->async_delalloc_pages) <=
4552                            (int)async_pages);
4553 skip_async:
4554                 spin_lock(&space_info->lock);
4555                 if (list_empty(&space_info->tickets) &&
4556                     list_empty(&space_info->priority_tickets)) {
4557                         spin_unlock(&space_info->lock);
4558                         break;
4559                 }
4560                 spin_unlock(&space_info->lock);
4561
4562                 loops++;
4563                 if (wait_ordered && !trans) {
4564                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4565                 } else {
4566                         time_left = schedule_timeout_killable(1);
4567                         if (time_left)
4568                                 break;
4569                 }
4570                 delalloc_bytes = percpu_counter_sum_positive(
4571                                                 &fs_info->delalloc_bytes);
4572                 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4573         }
4574 }
4575
4576 struct reserve_ticket {
4577         u64 orig_bytes;
4578         u64 bytes;
4579         int error;
4580         struct list_head list;
4581         wait_queue_head_t wait;
4582 };
4583
4584 /**
4585  * maybe_commit_transaction - possibly commit the transaction if its ok to
4586  * @root - the root we're allocating for
4587  * @bytes - the number of bytes we want to reserve
4588  * @force - force the commit
4589  *
4590  * This will check to make sure that committing the transaction will actually
4591  * get us somewhere and then commit the transaction if it does.  Otherwise it
4592  * will return -ENOSPC.
4593  */
4594 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4595                                   struct btrfs_space_info *space_info)
4596 {
4597         struct reserve_ticket *ticket = NULL;
4598         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4599         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4600         struct btrfs_trans_handle *trans;
4601         u64 bytes_needed;
4602         u64 reclaim_bytes = 0;
4603
4604         trans = (struct btrfs_trans_handle *)current->journal_info;
4605         if (trans)
4606                 return -EAGAIN;
4607
4608         spin_lock(&space_info->lock);
4609         if (!list_empty(&space_info->priority_tickets))
4610                 ticket = list_first_entry(&space_info->priority_tickets,
4611                                           struct reserve_ticket, list);
4612         else if (!list_empty(&space_info->tickets))
4613                 ticket = list_first_entry(&space_info->tickets,
4614                                           struct reserve_ticket, list);
4615         bytes_needed = (ticket) ? ticket->bytes : 0;
4616         spin_unlock(&space_info->lock);
4617
4618         if (!bytes_needed)
4619                 return 0;
4620
4621         trans = btrfs_join_transaction(fs_info->extent_root);
4622         if (IS_ERR(trans))
4623                 return PTR_ERR(trans);
4624
4625         /*
4626          * See if there is enough pinned space to make this reservation, or if
4627          * we have block groups that are going to be freed, allowing us to
4628          * possibly do a chunk allocation the next loop through.
4629          */
4630         if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4631             __percpu_counter_compare(&space_info->total_bytes_pinned,
4632                                      bytes_needed,
4633                                      BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4634                 goto commit;
4635
4636         /*
4637          * See if there is some space in the delayed insertion reservation for
4638          * this reservation.
4639          */
4640         if (space_info != delayed_rsv->space_info)
4641                 goto enospc;
4642
4643         spin_lock(&delayed_rsv->lock);
4644         reclaim_bytes += delayed_rsv->reserved;
4645         spin_unlock(&delayed_rsv->lock);
4646
4647         spin_lock(&delayed_refs_rsv->lock);
4648         reclaim_bytes += delayed_refs_rsv->reserved;
4649         spin_unlock(&delayed_refs_rsv->lock);
4650         if (reclaim_bytes >= bytes_needed)
4651                 goto commit;
4652         bytes_needed -= reclaim_bytes;
4653
4654         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4655                                    bytes_needed,
4656                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4657                 goto enospc;
4658
4659 commit:
4660         return btrfs_commit_transaction(trans);
4661 enospc:
4662         btrfs_end_transaction(trans);
4663         return -ENOSPC;
4664 }
4665
4666 /*
4667  * Try to flush some data based on policy set by @state. This is only advisory
4668  * and may fail for various reasons. The caller is supposed to examine the
4669  * state of @space_info to detect the outcome.
4670  */
4671 static void flush_space(struct btrfs_fs_info *fs_info,
4672                        struct btrfs_space_info *space_info, u64 num_bytes,
4673                        int state)
4674 {
4675         struct btrfs_root *root = fs_info->extent_root;
4676         struct btrfs_trans_handle *trans;
4677         int nr;
4678         int ret = 0;
4679
4680         switch (state) {
4681         case FLUSH_DELAYED_ITEMS_NR:
4682         case FLUSH_DELAYED_ITEMS:
4683                 if (state == FLUSH_DELAYED_ITEMS_NR)
4684                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4685                 else
4686                         nr = -1;
4687
4688                 trans = btrfs_join_transaction(root);
4689                 if (IS_ERR(trans)) {
4690                         ret = PTR_ERR(trans);
4691                         break;
4692                 }
4693                 ret = btrfs_run_delayed_items_nr(trans, nr);
4694                 btrfs_end_transaction(trans);
4695                 break;
4696         case FLUSH_DELALLOC:
4697         case FLUSH_DELALLOC_WAIT:
4698                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4699                                 state == FLUSH_DELALLOC_WAIT);
4700                 break;
4701         case FLUSH_DELAYED_REFS_NR:
4702         case FLUSH_DELAYED_REFS:
4703                 trans = btrfs_join_transaction(root);
4704                 if (IS_ERR(trans)) {
4705                         ret = PTR_ERR(trans);
4706                         break;
4707                 }
4708                 if (state == FLUSH_DELAYED_REFS_NR)
4709                         nr = calc_reclaim_items_nr(fs_info, num_bytes);
4710                 else
4711                         nr = 0;
4712                 btrfs_run_delayed_refs(trans, nr);
4713                 btrfs_end_transaction(trans);
4714                 break;
4715         case ALLOC_CHUNK:
4716         case ALLOC_CHUNK_FORCE:
4717                 trans = btrfs_join_transaction(root);
4718                 if (IS_ERR(trans)) {
4719                         ret = PTR_ERR(trans);
4720                         break;
4721                 }
4722                 ret = btrfs_chunk_alloc(trans,
4723                                 btrfs_metadata_alloc_profile(fs_info),
4724                                 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
4725                                         CHUNK_ALLOC_FORCE);
4726                 btrfs_end_transaction(trans);
4727                 if (ret > 0 || ret == -ENOSPC)
4728                         ret = 0;
4729                 break;
4730         case COMMIT_TRANS:
4731                 /*
4732                  * If we have pending delayed iputs then we could free up a
4733                  * bunch of pinned space, so make sure we run the iputs before
4734                  * we do our pinned bytes check below.
4735                  */
4736                 btrfs_run_delayed_iputs(fs_info);
4737                 btrfs_wait_on_delayed_iputs(fs_info);
4738
4739                 ret = may_commit_transaction(fs_info, space_info);
4740                 break;
4741         default:
4742                 ret = -ENOSPC;
4743                 break;
4744         }
4745
4746         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4747                                 ret);
4748         return;
4749 }
4750
4751 static inline u64
4752 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4753                                  struct btrfs_space_info *space_info,
4754                                  bool system_chunk)
4755 {
4756         struct reserve_ticket *ticket;
4757         u64 used;
4758         u64 expected;
4759         u64 to_reclaim = 0;
4760
4761         list_for_each_entry(ticket, &space_info->tickets, list)
4762                 to_reclaim += ticket->bytes;
4763         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4764                 to_reclaim += ticket->bytes;
4765         if (to_reclaim)
4766                 return to_reclaim;
4767
4768         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4769         if (can_overcommit(fs_info, space_info, to_reclaim,
4770                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4771                 return 0;
4772
4773         used = btrfs_space_info_used(space_info, true);
4774
4775         if (can_overcommit(fs_info, space_info, SZ_1M,
4776                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4777                 expected = div_factor_fine(space_info->total_bytes, 95);
4778         else
4779                 expected = div_factor_fine(space_info->total_bytes, 90);
4780
4781         if (used > expected)
4782                 to_reclaim = used - expected;
4783         else
4784                 to_reclaim = 0;
4785         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4786                                      space_info->bytes_reserved);
4787         return to_reclaim;
4788 }
4789
4790 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4791                                         struct btrfs_space_info *space_info,
4792                                         u64 used, bool system_chunk)
4793 {
4794         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4795
4796         /* If we're just plain full then async reclaim just slows us down. */
4797         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4798                 return 0;
4799
4800         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4801                                               system_chunk))
4802                 return 0;
4803
4804         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4805                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4806 }
4807
4808 static bool wake_all_tickets(struct list_head *head)
4809 {
4810         struct reserve_ticket *ticket;
4811
4812         while (!list_empty(head)) {
4813                 ticket = list_first_entry(head, struct reserve_ticket, list);
4814                 list_del_init(&ticket->list);
4815                 ticket->error = -ENOSPC;
4816                 wake_up(&ticket->wait);
4817                 if (ticket->bytes != ticket->orig_bytes)
4818                         return true;
4819         }
4820         return false;
4821 }
4822
4823 /*
4824  * This is for normal flushers, we can wait all goddamned day if we want to.  We
4825  * will loop and continuously try to flush as long as we are making progress.
4826  * We count progress as clearing off tickets each time we have to loop.
4827  */
4828 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4829 {
4830         struct btrfs_fs_info *fs_info;
4831         struct btrfs_space_info *space_info;
4832         u64 to_reclaim;
4833         int flush_state;
4834         int commit_cycles = 0;
4835         u64 last_tickets_id;
4836
4837         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4838         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4839
4840         spin_lock(&space_info->lock);
4841         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4842                                                       false);
4843         if (!to_reclaim) {
4844                 space_info->flush = 0;
4845                 spin_unlock(&space_info->lock);
4846                 return;
4847         }
4848         last_tickets_id = space_info->tickets_id;
4849         spin_unlock(&space_info->lock);
4850
4851         flush_state = FLUSH_DELAYED_ITEMS_NR;
4852         do {
4853                 flush_space(fs_info, space_info, to_reclaim, flush_state);
4854                 spin_lock(&space_info->lock);
4855                 if (list_empty(&space_info->tickets)) {
4856                         space_info->flush = 0;
4857                         spin_unlock(&space_info->lock);
4858                         return;
4859                 }
4860                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4861                                                               space_info,
4862                                                               false);
4863                 if (last_tickets_id == space_info->tickets_id) {
4864                         flush_state++;
4865                 } else {
4866                         last_tickets_id = space_info->tickets_id;
4867                         flush_state = FLUSH_DELAYED_ITEMS_NR;
4868                         if (commit_cycles)
4869                                 commit_cycles--;
4870                 }
4871
4872                 /*
4873                  * We don't want to force a chunk allocation until we've tried
4874                  * pretty hard to reclaim space.  Think of the case where we
4875                  * freed up a bunch of space and so have a lot of pinned space
4876                  * to reclaim.  We would rather use that than possibly create a
4877                  * underutilized metadata chunk.  So if this is our first run
4878                  * through the flushing state machine skip ALLOC_CHUNK_FORCE and
4879                  * commit the transaction.  If nothing has changed the next go
4880                  * around then we can force a chunk allocation.
4881                  */
4882                 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
4883                         flush_state++;
4884
4885                 if (flush_state > COMMIT_TRANS) {
4886                         commit_cycles++;
4887                         if (commit_cycles > 2) {
4888                                 if (wake_all_tickets(&space_info->tickets)) {
4889                                         flush_state = FLUSH_DELAYED_ITEMS_NR;
4890                                         commit_cycles--;
4891                                 } else {
4892                                         space_info->flush = 0;
4893                                 }
4894                         } else {
4895                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
4896                         }
4897                 }
4898                 spin_unlock(&space_info->lock);
4899         } while (flush_state <= COMMIT_TRANS);
4900 }
4901
4902 void btrfs_init_async_reclaim_work(struct work_struct *work)
4903 {
4904         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4905 }
4906
4907 static const enum btrfs_flush_state priority_flush_states[] = {
4908         FLUSH_DELAYED_ITEMS_NR,
4909         FLUSH_DELAYED_ITEMS,
4910         ALLOC_CHUNK,
4911 };
4912
4913 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
4914                                             struct btrfs_space_info *space_info,
4915                                             struct reserve_ticket *ticket)
4916 {
4917         u64 to_reclaim;
4918         int flush_state;
4919
4920         spin_lock(&space_info->lock);
4921         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4922                                                       false);
4923         if (!to_reclaim) {
4924                 spin_unlock(&space_info->lock);
4925                 return;
4926         }
4927         spin_unlock(&space_info->lock);
4928
4929         flush_state = 0;
4930         do {
4931                 flush_space(fs_info, space_info, to_reclaim,
4932                             priority_flush_states[flush_state]);
4933                 flush_state++;
4934                 spin_lock(&space_info->lock);
4935                 if (ticket->bytes == 0) {
4936                         spin_unlock(&space_info->lock);
4937                         return;
4938                 }
4939                 spin_unlock(&space_info->lock);
4940         } while (flush_state < ARRAY_SIZE(priority_flush_states));
4941 }
4942
4943 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
4944                                struct btrfs_space_info *space_info,
4945                                struct reserve_ticket *ticket)
4946
4947 {
4948         DEFINE_WAIT(wait);
4949         u64 reclaim_bytes = 0;
4950         int ret = 0;
4951
4952         spin_lock(&space_info->lock);
4953         while (ticket->bytes > 0 && ticket->error == 0) {
4954                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
4955                 if (ret) {
4956                         ret = -EINTR;
4957                         break;
4958                 }
4959                 spin_unlock(&space_info->lock);
4960
4961                 schedule();
4962
4963                 finish_wait(&ticket->wait, &wait);
4964                 spin_lock(&space_info->lock);
4965         }
4966         if (!ret)
4967                 ret = ticket->error;
4968         if (!list_empty(&ticket->list))
4969                 list_del_init(&ticket->list);
4970         if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
4971                 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
4972         spin_unlock(&space_info->lock);
4973
4974         if (reclaim_bytes)
4975                 btrfs_space_info_add_old_bytes(fs_info, space_info,
4976                                                reclaim_bytes);
4977         return ret;
4978 }
4979
4980 /**
4981  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4982  * @root - the root we're allocating for
4983  * @space_info - the space info we want to allocate from
4984  * @orig_bytes - the number of bytes we want
4985  * @flush - whether or not we can flush to make our reservation
4986  *
4987  * This will reserve orig_bytes number of bytes from the space info associated
4988  * with the block_rsv.  If there is not enough space it will make an attempt to
4989  * flush out space to make room.  It will do this by flushing delalloc if
4990  * possible or committing the transaction.  If flush is 0 then no attempts to
4991  * regain reservations will be made and this will fail if there is not enough
4992  * space already.
4993  */
4994 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
4995                                     struct btrfs_space_info *space_info,
4996                                     u64 orig_bytes,
4997                                     enum btrfs_reserve_flush_enum flush,
4998                                     bool system_chunk)
4999 {
5000         struct reserve_ticket ticket;
5001         u64 used;
5002         u64 reclaim_bytes = 0;
5003         int ret = 0;
5004
5005         ASSERT(orig_bytes);
5006         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5007
5008         spin_lock(&space_info->lock);
5009         ret = -ENOSPC;
5010         used = btrfs_space_info_used(space_info, true);
5011
5012         /*
5013          * If we have enough space then hooray, make our reservation and carry
5014          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5015          * If not things get more complicated.
5016          */
5017         if (used + orig_bytes <= space_info->total_bytes) {
5018                 update_bytes_may_use(fs_info, space_info, orig_bytes);
5019                 trace_btrfs_space_reservation(fs_info, "space_info",
5020                                               space_info->flags, orig_bytes, 1);
5021                 ret = 0;
5022         } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5023                                   system_chunk)) {
5024                 update_bytes_may_use(fs_info, space_info, orig_bytes);
5025                 trace_btrfs_space_reservation(fs_info, "space_info",
5026                                               space_info->flags, orig_bytes, 1);
5027                 ret = 0;
5028         }
5029
5030         /*
5031          * If we couldn't make a reservation then setup our reservation ticket
5032          * and kick the async worker if it's not already running.
5033          *
5034          * If we are a priority flusher then we just need to add our ticket to
5035          * the list and we will do our own flushing further down.
5036          */
5037         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5038                 ticket.orig_bytes = orig_bytes;
5039                 ticket.bytes = orig_bytes;
5040                 ticket.error = 0;
5041                 init_waitqueue_head(&ticket.wait);
5042                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5043                         list_add_tail(&ticket.list, &space_info->tickets);
5044                         if (!space_info->flush) {
5045                                 space_info->flush = 1;
5046                                 trace_btrfs_trigger_flush(fs_info,
5047                                                           space_info->flags,
5048                                                           orig_bytes, flush,
5049                                                           "enospc");
5050                                 queue_work(system_unbound_wq,
5051                                            &fs_info->async_reclaim_work);
5052                         }
5053                 } else {
5054                         list_add_tail(&ticket.list,
5055                                       &space_info->priority_tickets);
5056                 }
5057         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5058                 used += orig_bytes;
5059                 /*
5060                  * We will do the space reservation dance during log replay,
5061                  * which means we won't have fs_info->fs_root set, so don't do
5062                  * the async reclaim as we will panic.
5063                  */
5064                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5065                     need_do_async_reclaim(fs_info, space_info,
5066                                           used, system_chunk) &&
5067                     !work_busy(&fs_info->async_reclaim_work)) {
5068                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
5069                                                   orig_bytes, flush, "preempt");
5070                         queue_work(system_unbound_wq,
5071                                    &fs_info->async_reclaim_work);
5072                 }
5073         }
5074         spin_unlock(&space_info->lock);
5075         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5076                 return ret;
5077
5078         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5079                 return wait_reserve_ticket(fs_info, space_info, &ticket);
5080
5081         ret = 0;
5082         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5083         spin_lock(&space_info->lock);
5084         if (ticket.bytes) {
5085                 if (ticket.bytes < orig_bytes)
5086                         reclaim_bytes = orig_bytes - ticket.bytes;
5087                 list_del_init(&ticket.list);
5088                 ret = -ENOSPC;
5089         }
5090         spin_unlock(&space_info->lock);
5091
5092         if (reclaim_bytes)
5093                 btrfs_space_info_add_old_bytes(fs_info, space_info,
5094                                                reclaim_bytes);
5095         ASSERT(list_empty(&ticket.list));
5096         return ret;
5097 }
5098
5099 /**
5100  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5101  * @root - the root we're allocating for
5102  * @block_rsv - the block_rsv we're allocating for
5103  * @orig_bytes - the number of bytes we want
5104  * @flush - whether or not we can flush to make our reservation
5105  *
5106  * This will reserve orig_bytes number of bytes from the space info associated
5107  * with the block_rsv.  If there is not enough space it will make an attempt to
5108  * flush out space to make room.  It will do this by flushing delalloc if
5109  * possible or committing the transaction.  If flush is 0 then no attempts to
5110  * regain reservations will be made and this will fail if there is not enough
5111  * space already.
5112  */
5113 static int reserve_metadata_bytes(struct btrfs_root *root,
5114                                   struct btrfs_block_rsv *block_rsv,
5115                                   u64 orig_bytes,
5116                                   enum btrfs_reserve_flush_enum flush)
5117 {
5118         struct btrfs_fs_info *fs_info = root->fs_info;
5119         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5120         int ret;
5121         bool system_chunk = (root == fs_info->chunk_root);
5122
5123         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5124                                        orig_bytes, flush, system_chunk);
5125         if (ret == -ENOSPC &&
5126             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5127                 if (block_rsv != global_rsv &&
5128                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5129                         ret = 0;
5130         }
5131         if (ret == -ENOSPC) {
5132                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5133                                               block_rsv->space_info->flags,
5134                                               orig_bytes, 1);
5135
5136                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5137                         dump_space_info(fs_info, block_rsv->space_info,
5138                                         orig_bytes, 0);
5139         }
5140         return ret;
5141 }
5142
5143 static struct btrfs_block_rsv *get_block_rsv(
5144                                         const struct btrfs_trans_handle *trans,
5145                                         const struct btrfs_root *root)
5146 {
5147         struct btrfs_fs_info *fs_info = root->fs_info;
5148         struct btrfs_block_rsv *block_rsv = NULL;
5149
5150         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5151             (root == fs_info->csum_root && trans->adding_csums) ||
5152             (root == fs_info->uuid_root))
5153                 block_rsv = trans->block_rsv;
5154
5155         if (!block_rsv)
5156                 block_rsv = root->block_rsv;
5157
5158         if (!block_rsv)
5159                 block_rsv = &fs_info->empty_block_rsv;
5160
5161         return block_rsv;
5162 }
5163
5164 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5165                                u64 num_bytes)
5166 {
5167         int ret = -ENOSPC;
5168         spin_lock(&block_rsv->lock);
5169         if (block_rsv->reserved >= num_bytes) {
5170                 block_rsv->reserved -= num_bytes;
5171                 if (block_rsv->reserved < block_rsv->size)
5172                         block_rsv->full = 0;
5173                 ret = 0;
5174         }
5175         spin_unlock(&block_rsv->lock);
5176         return ret;
5177 }
5178
5179 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5180                                 u64 num_bytes, bool update_size)
5181 {
5182         spin_lock(&block_rsv->lock);
5183         block_rsv->reserved += num_bytes;
5184         if (update_size)
5185                 block_rsv->size += num_bytes;
5186         else if (block_rsv->reserved >= block_rsv->size)
5187                 block_rsv->full = 1;
5188         spin_unlock(&block_rsv->lock);
5189 }
5190
5191 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5192                              struct btrfs_block_rsv *dest, u64 num_bytes,
5193                              int min_factor)
5194 {
5195         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5196         u64 min_bytes;
5197
5198         if (global_rsv->space_info != dest->space_info)
5199                 return -ENOSPC;
5200
5201         spin_lock(&global_rsv->lock);
5202         min_bytes = div_factor(global_rsv->size, min_factor);
5203         if (global_rsv->reserved < min_bytes + num_bytes) {
5204                 spin_unlock(&global_rsv->lock);
5205                 return -ENOSPC;
5206         }
5207         global_rsv->reserved -= num_bytes;
5208         if (global_rsv->reserved < global_rsv->size)
5209                 global_rsv->full = 0;
5210         spin_unlock(&global_rsv->lock);
5211
5212         block_rsv_add_bytes(dest, num_bytes, true);
5213         return 0;
5214 }
5215
5216 /**
5217  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5218  * @fs_info - the fs info for our fs.
5219  * @src - the source block rsv to transfer from.
5220  * @num_bytes - the number of bytes to transfer.
5221  *
5222  * This transfers up to the num_bytes amount from the src rsv to the
5223  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
5224  */
5225 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5226                                        struct btrfs_block_rsv *src,
5227                                        u64 num_bytes)
5228 {
5229         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5230         u64 to_free = 0;
5231
5232         spin_lock(&src->lock);
5233         src->reserved -= num_bytes;
5234         src->size -= num_bytes;
5235         spin_unlock(&src->lock);
5236
5237         spin_lock(&delayed_refs_rsv->lock);
5238         if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5239                 u64 delta = delayed_refs_rsv->size -
5240                         delayed_refs_rsv->reserved;
5241                 if (num_bytes > delta) {
5242                         to_free = num_bytes - delta;
5243                         num_bytes = delta;
5244                 }
5245         } else {
5246                 to_free = num_bytes;
5247                 num_bytes = 0;
5248         }
5249
5250         if (num_bytes)
5251                 delayed_refs_rsv->reserved += num_bytes;
5252         if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5253                 delayed_refs_rsv->full = 1;
5254         spin_unlock(&delayed_refs_rsv->lock);
5255
5256         if (num_bytes)
5257                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5258                                               0, num_bytes, 1);
5259         if (to_free)
5260                 btrfs_space_info_add_old_bytes(fs_info,
5261                                 delayed_refs_rsv->space_info, to_free);
5262 }
5263
5264 /**
5265  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5266  * @fs_info - the fs_info for our fs.
5267  * @flush - control how we can flush for this reservation.
5268  *
5269  * This will refill the delayed block_rsv up to 1 items size worth of space and
5270  * will return -ENOSPC if we can't make the reservation.
5271  */
5272 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5273                                   enum btrfs_reserve_flush_enum flush)
5274 {
5275         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5276         u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5277         u64 num_bytes = 0;
5278         int ret = -ENOSPC;
5279
5280         spin_lock(&block_rsv->lock);
5281         if (block_rsv->reserved < block_rsv->size) {
5282                 num_bytes = block_rsv->size - block_rsv->reserved;
5283                 num_bytes = min(num_bytes, limit);
5284         }
5285         spin_unlock(&block_rsv->lock);
5286
5287         if (!num_bytes)
5288                 return 0;
5289
5290         ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5291                                      num_bytes, flush);
5292         if (ret)
5293                 return ret;
5294         block_rsv_add_bytes(block_rsv, num_bytes, 0);
5295         trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5296                                       0, num_bytes, 1);
5297         return 0;
5298 }
5299
5300 /*
5301  * This is for space we already have accounted in space_info->bytes_may_use, so
5302  * basically when we're returning space from block_rsv's.
5303  */
5304 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5305                                     struct btrfs_space_info *space_info,
5306                                     u64 num_bytes)
5307 {
5308         struct reserve_ticket *ticket;
5309         struct list_head *head;
5310         u64 used;
5311         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5312         bool check_overcommit = false;
5313
5314         spin_lock(&space_info->lock);
5315         head = &space_info->priority_tickets;
5316
5317         /*
5318          * If we are over our limit then we need to check and see if we can
5319          * overcommit, and if we can't then we just need to free up our space
5320          * and not satisfy any requests.
5321          */
5322         used = btrfs_space_info_used(space_info, true);
5323         if (used - num_bytes >= space_info->total_bytes)
5324                 check_overcommit = true;
5325 again:
5326         while (!list_empty(head) && num_bytes) {
5327                 ticket = list_first_entry(head, struct reserve_ticket,
5328                                           list);
5329                 /*
5330                  * We use 0 bytes because this space is already reserved, so
5331                  * adding the ticket space would be a double count.
5332                  */
5333                 if (check_overcommit &&
5334                     !can_overcommit(fs_info, space_info, 0, flush, false))
5335                         break;
5336                 if (num_bytes >= ticket->bytes) {
5337                         list_del_init(&ticket->list);
5338                         num_bytes -= ticket->bytes;
5339                         ticket->bytes = 0;
5340                         space_info->tickets_id++;
5341                         wake_up(&ticket->wait);
5342                 } else {
5343                         ticket->bytes -= num_bytes;
5344                         num_bytes = 0;
5345                 }
5346         }
5347
5348         if (num_bytes && head == &space_info->priority_tickets) {
5349                 head = &space_info->tickets;
5350                 flush = BTRFS_RESERVE_FLUSH_ALL;
5351                 goto again;
5352         }
5353         update_bytes_may_use(fs_info, space_info, -num_bytes);
5354         trace_btrfs_space_reservation(fs_info, "space_info",
5355                                       space_info->flags, num_bytes, 0);
5356         spin_unlock(&space_info->lock);
5357 }
5358
5359 /*
5360  * This is for newly allocated space that isn't accounted in
5361  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5362  * we use this helper.
5363  */
5364 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5365                                     struct btrfs_space_info *space_info,
5366                                     u64 num_bytes)
5367 {
5368         struct reserve_ticket *ticket;
5369         struct list_head *head = &space_info->priority_tickets;
5370
5371 again:
5372         while (!list_empty(head) && num_bytes) {
5373                 ticket = list_first_entry(head, struct reserve_ticket,
5374                                           list);
5375                 if (num_bytes >= ticket->bytes) {
5376                         trace_btrfs_space_reservation(fs_info, "space_info",
5377                                                       space_info->flags,
5378                                                       ticket->bytes, 1);
5379                         list_del_init(&ticket->list);
5380                         num_bytes -= ticket->bytes;
5381                         update_bytes_may_use(fs_info, space_info,
5382                                              ticket->bytes);
5383                         ticket->bytes = 0;
5384                         space_info->tickets_id++;
5385                         wake_up(&ticket->wait);
5386                 } else {
5387                         trace_btrfs_space_reservation(fs_info, "space_info",
5388                                                       space_info->flags,
5389                                                       num_bytes, 1);
5390                         update_bytes_may_use(fs_info, space_info, num_bytes);
5391                         ticket->bytes -= num_bytes;
5392                         num_bytes = 0;
5393                 }
5394         }
5395
5396         if (num_bytes && head == &space_info->priority_tickets) {
5397                 head = &space_info->tickets;
5398                 goto again;
5399         }
5400 }
5401
5402 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5403                                     struct btrfs_block_rsv *block_rsv,
5404                                     struct btrfs_block_rsv *dest, u64 num_bytes,
5405                                     u64 *qgroup_to_release_ret)
5406 {
5407         struct btrfs_space_info *space_info = block_rsv->space_info;
5408         u64 qgroup_to_release = 0;
5409         u64 ret;
5410
5411         spin_lock(&block_rsv->lock);
5412         if (num_bytes == (u64)-1) {
5413                 num_bytes = block_rsv->size;
5414                 qgroup_to_release = block_rsv->qgroup_rsv_size;
5415         }
5416         block_rsv->size -= num_bytes;
5417         if (block_rsv->reserved >= block_rsv->size) {
5418                 num_bytes = block_rsv->reserved - block_rsv->size;
5419                 block_rsv->reserved = block_rsv->size;
5420                 block_rsv->full = 1;
5421         } else {
5422                 num_bytes = 0;
5423         }
5424         if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5425                 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5426                                     block_rsv->qgroup_rsv_size;
5427                 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5428         } else {
5429                 qgroup_to_release = 0;
5430         }
5431         spin_unlock(&block_rsv->lock);
5432
5433         ret = num_bytes;
5434         if (num_bytes > 0) {
5435                 if (dest) {
5436                         spin_lock(&dest->lock);
5437                         if (!dest->full) {
5438                                 u64 bytes_to_add;
5439
5440                                 bytes_to_add = dest->size - dest->reserved;
5441                                 bytes_to_add = min(num_bytes, bytes_to_add);
5442                                 dest->reserved += bytes_to_add;
5443                                 if (dest->reserved >= dest->size)
5444                                         dest->full = 1;
5445                                 num_bytes -= bytes_to_add;
5446                         }
5447                         spin_unlock(&dest->lock);
5448                 }
5449                 if (num_bytes)
5450                         btrfs_space_info_add_old_bytes(fs_info, space_info,
5451                                                        num_bytes);
5452         }
5453         if (qgroup_to_release_ret)
5454                 *qgroup_to_release_ret = qgroup_to_release;
5455         return ret;
5456 }
5457
5458 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5459                             struct btrfs_block_rsv *dst, u64 num_bytes,
5460                             bool update_size)
5461 {
5462         int ret;
5463
5464         ret = block_rsv_use_bytes(src, num_bytes);
5465         if (ret)
5466                 return ret;
5467
5468         block_rsv_add_bytes(dst, num_bytes, update_size);
5469         return 0;
5470 }
5471
5472 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5473 {
5474         memset(rsv, 0, sizeof(*rsv));
5475         spin_lock_init(&rsv->lock);
5476         rsv->type = type;
5477 }
5478
5479 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5480                                    struct btrfs_block_rsv *rsv,
5481                                    unsigned short type)
5482 {
5483         btrfs_init_block_rsv(rsv, type);
5484         rsv->space_info = btrfs_find_space_info(fs_info,
5485                                             BTRFS_BLOCK_GROUP_METADATA);
5486 }
5487
5488 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5489                                               unsigned short type)
5490 {
5491         struct btrfs_block_rsv *block_rsv;
5492
5493         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5494         if (!block_rsv)
5495                 return NULL;
5496
5497         btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5498         return block_rsv;
5499 }
5500
5501 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5502                           struct btrfs_block_rsv *rsv)
5503 {
5504         if (!rsv)
5505                 return;
5506         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5507         kfree(rsv);
5508 }
5509
5510 int btrfs_block_rsv_add(struct btrfs_root *root,
5511                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5512                         enum btrfs_reserve_flush_enum flush)
5513 {
5514         int ret;
5515
5516         if (num_bytes == 0)
5517                 return 0;
5518
5519         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5520         if (!ret)
5521                 block_rsv_add_bytes(block_rsv, num_bytes, true);
5522
5523         return ret;
5524 }
5525
5526 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5527 {
5528         u64 num_bytes = 0;
5529         int ret = -ENOSPC;
5530
5531         if (!block_rsv)
5532                 return 0;
5533
5534         spin_lock(&block_rsv->lock);
5535         num_bytes = div_factor(block_rsv->size, min_factor);
5536         if (block_rsv->reserved >= num_bytes)
5537                 ret = 0;
5538         spin_unlock(&block_rsv->lock);
5539
5540         return ret;
5541 }
5542
5543 int btrfs_block_rsv_refill(struct btrfs_root *root,
5544                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5545                            enum btrfs_reserve_flush_enum flush)
5546 {
5547         u64 num_bytes = 0;
5548         int ret = -ENOSPC;
5549
5550         if (!block_rsv)
5551                 return 0;
5552
5553         spin_lock(&block_rsv->lock);
5554         num_bytes = min_reserved;
5555         if (block_rsv->reserved >= num_bytes)
5556                 ret = 0;
5557         else
5558                 num_bytes -= block_rsv->reserved;
5559         spin_unlock(&block_rsv->lock);
5560
5561         if (!ret)
5562                 return 0;
5563
5564         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5565         if (!ret) {
5566                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5567                 return 0;
5568         }
5569
5570         return ret;
5571 }
5572
5573 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5574                                      struct btrfs_block_rsv *block_rsv,
5575                                      u64 num_bytes, u64 *qgroup_to_release)
5576 {
5577         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5578         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5579         struct btrfs_block_rsv *target = delayed_rsv;
5580
5581         if (target->full || target == block_rsv)
5582                 target = global_rsv;
5583
5584         if (block_rsv->space_info != target->space_info)
5585                 target = NULL;
5586
5587         return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5588                                        qgroup_to_release);
5589 }
5590
5591 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5592                              struct btrfs_block_rsv *block_rsv,
5593                              u64 num_bytes)
5594 {
5595         __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5596 }
5597
5598 /**
5599  * btrfs_inode_rsv_release - release any excessive reservation.
5600  * @inode - the inode we need to release from.
5601  * @qgroup_free - free or convert qgroup meta.
5602  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5603  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5604  *   @qgroup_free is true for error handling, and false for normal release.
5605  *
5606  * This is the same as btrfs_block_rsv_release, except that it handles the
5607  * tracepoint for the reservation.
5608  */
5609 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5610 {
5611         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5612         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5613         u64 released = 0;
5614         u64 qgroup_to_release = 0;
5615
5616         /*
5617          * Since we statically set the block_rsv->size we just want to say we
5618          * are releasing 0 bytes, and then we'll just get the reservation over
5619          * the size free'd.
5620          */
5621         released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5622                                              &qgroup_to_release);
5623         if (released > 0)
5624                 trace_btrfs_space_reservation(fs_info, "delalloc",
5625                                               btrfs_ino(inode), released, 0);
5626         if (qgroup_free)
5627                 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5628         else
5629                 btrfs_qgroup_convert_reserved_meta(inode->root,
5630                                                    qgroup_to_release);
5631 }
5632
5633 /**
5634  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5635  * @fs_info - the fs_info for our fs.
5636  * @nr - the number of items to drop.
5637  *
5638  * This drops the delayed ref head's count from the delayed refs rsv and frees
5639  * any excess reservation we had.
5640  */
5641 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5642 {
5643         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5644         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5645         u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5646         u64 released = 0;
5647
5648         released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5649                                            num_bytes, NULL);
5650         if (released)
5651                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5652                                               0, released, 0);
5653 }
5654
5655 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5656 {
5657         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5658         struct btrfs_space_info *sinfo = block_rsv->space_info;
5659         u64 num_bytes;
5660
5661         /*
5662          * The global block rsv is based on the size of the extent tree, the
5663          * checksum tree and the root tree.  If the fs is empty we want to set
5664          * it to a minimal amount for safety.
5665          */
5666         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5667                 btrfs_root_used(&fs_info->csum_root->root_item) +
5668                 btrfs_root_used(&fs_info->tree_root->root_item);
5669         num_bytes = max_t(u64, num_bytes, SZ_16M);
5670
5671         spin_lock(&sinfo->lock);
5672         spin_lock(&block_rsv->lock);
5673
5674         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5675
5676         if (block_rsv->reserved < block_rsv->size) {
5677                 num_bytes = btrfs_space_info_used(sinfo, true);
5678                 if (sinfo->total_bytes > num_bytes) {
5679                         num_bytes = sinfo->total_bytes - num_bytes;
5680                         num_bytes = min(num_bytes,
5681                                         block_rsv->size - block_rsv->reserved);
5682                         block_rsv->reserved += num_bytes;
5683                         update_bytes_may_use(fs_info, sinfo, num_bytes);
5684                         trace_btrfs_space_reservation(fs_info, "space_info",
5685                                                       sinfo->flags, num_bytes,
5686                                                       1);
5687                 }
5688         } else if (block_rsv->reserved > block_rsv->size) {
5689                 num_bytes = block_rsv->reserved - block_rsv->size;
5690                 update_bytes_may_use(fs_info, sinfo, -num_bytes);
5691                 trace_btrfs_space_reservation(fs_info, "space_info",
5692                                       sinfo->flags, num_bytes, 0);
5693                 block_rsv->reserved = block_rsv->size;
5694         }
5695
5696         if (block_rsv->reserved == block_rsv->size)
5697                 block_rsv->full = 1;
5698         else
5699                 block_rsv->full = 0;
5700
5701         spin_unlock(&block_rsv->lock);
5702         spin_unlock(&sinfo->lock);
5703 }
5704
5705 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5706 {
5707         struct btrfs_space_info *space_info;
5708
5709         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5710         fs_info->chunk_block_rsv.space_info = space_info;
5711
5712         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5713         fs_info->global_block_rsv.space_info = space_info;
5714         fs_info->trans_block_rsv.space_info = space_info;
5715         fs_info->empty_block_rsv.space_info = space_info;
5716         fs_info->delayed_block_rsv.space_info = space_info;
5717         fs_info->delayed_refs_rsv.space_info = space_info;
5718
5719         fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5720         fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5721         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5722         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5723         if (fs_info->quota_root)
5724                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5725         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5726
5727         update_global_block_rsv(fs_info);
5728 }
5729
5730 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5731 {
5732         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5733                                 (u64)-1, NULL);
5734         WARN_ON(fs_info->trans_block_rsv.size > 0);
5735         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5736         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5737         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5738         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5739         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5740         WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5741         WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5742 }
5743
5744 /*
5745  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5746  * @trans - the trans that may have generated delayed refs
5747  *
5748  * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
5749  * it'll calculate the additional size and add it to the delayed_refs_rsv.
5750  */
5751 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
5752 {
5753         struct btrfs_fs_info *fs_info = trans->fs_info;
5754         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5755         u64 num_bytes;
5756
5757         if (!trans->delayed_ref_updates)
5758                 return;
5759
5760         num_bytes = btrfs_calc_trans_metadata_size(fs_info,
5761                                                    trans->delayed_ref_updates);
5762         spin_lock(&delayed_rsv->lock);
5763         delayed_rsv->size += num_bytes;
5764         delayed_rsv->full = 0;
5765         spin_unlock(&delayed_rsv->lock);
5766         trans->delayed_ref_updates = 0;
5767 }
5768
5769 /*
5770  * To be called after all the new block groups attached to the transaction
5771  * handle have been created (btrfs_create_pending_block_groups()).
5772  */
5773 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5774 {
5775         struct btrfs_fs_info *fs_info = trans->fs_info;
5776
5777         if (!trans->chunk_bytes_reserved)
5778                 return;
5779
5780         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5781
5782         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5783                                 trans->chunk_bytes_reserved, NULL);
5784         trans->chunk_bytes_reserved = 0;
5785 }
5786
5787 /*
5788  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5789  * root: the root of the parent directory
5790  * rsv: block reservation
5791  * items: the number of items that we need do reservation
5792  * use_global_rsv: allow fallback to the global block reservation
5793  *
5794  * This function is used to reserve the space for snapshot/subvolume
5795  * creation and deletion. Those operations are different with the
5796  * common file/directory operations, they change two fs/file trees
5797  * and root tree, the number of items that the qgroup reserves is
5798  * different with the free space reservation. So we can not use
5799  * the space reservation mechanism in start_transaction().
5800  */
5801 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5802                                      struct btrfs_block_rsv *rsv, int items,
5803                                      bool use_global_rsv)
5804 {
5805         u64 qgroup_num_bytes = 0;
5806         u64 num_bytes;
5807         int ret;
5808         struct btrfs_fs_info *fs_info = root->fs_info;
5809         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5810
5811         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5812                 /* One for parent inode, two for dir entries */
5813                 qgroup_num_bytes = 3 * fs_info->nodesize;
5814                 ret = btrfs_qgroup_reserve_meta_prealloc(root,
5815                                 qgroup_num_bytes, true);
5816                 if (ret)
5817                         return ret;
5818         }
5819
5820         num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5821         rsv->space_info = btrfs_find_space_info(fs_info,
5822                                             BTRFS_BLOCK_GROUP_METADATA);
5823         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5824                                   BTRFS_RESERVE_FLUSH_ALL);
5825
5826         if (ret == -ENOSPC && use_global_rsv)
5827                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
5828
5829         if (ret && qgroup_num_bytes)
5830                 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5831
5832         return ret;
5833 }
5834
5835 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5836                                       struct btrfs_block_rsv *rsv)
5837 {
5838         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5839 }
5840
5841 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5842                                                  struct btrfs_inode *inode)
5843 {
5844         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5845         u64 reserve_size = 0;
5846         u64 qgroup_rsv_size = 0;
5847         u64 csum_leaves;
5848         unsigned outstanding_extents;
5849
5850         lockdep_assert_held(&inode->lock);
5851         outstanding_extents = inode->outstanding_extents;
5852         if (outstanding_extents)
5853                 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5854                                                 outstanding_extents + 1);
5855         csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5856                                                  inode->csum_bytes);
5857         reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5858                                                        csum_leaves);
5859         /*
5860          * For qgroup rsv, the calculation is very simple:
5861          * account one nodesize for each outstanding extent
5862          *
5863          * This is overestimating in most cases.
5864          */
5865         qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5866
5867         spin_lock(&block_rsv->lock);
5868         block_rsv->size = reserve_size;
5869         block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5870         spin_unlock(&block_rsv->lock);
5871 }
5872
5873 static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
5874                                     u64 num_bytes, u64 *meta_reserve,
5875                                     u64 *qgroup_reserve)
5876 {
5877         u64 nr_extents = count_max_extents(num_bytes);
5878         u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
5879
5880         /* We add one for the inode update at finish ordered time */
5881         *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
5882                                                 nr_extents + csum_leaves + 1);
5883         *qgroup_reserve = nr_extents * fs_info->nodesize;
5884 }
5885
5886 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5887 {
5888         struct btrfs_root *root = inode->root;
5889         struct btrfs_fs_info *fs_info = root->fs_info;
5890         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5891         u64 meta_reserve, qgroup_reserve;
5892         unsigned nr_extents;
5893         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5894         int ret = 0;
5895         bool delalloc_lock = true;
5896
5897         /* If we are a free space inode we need to not flush since we will be in
5898          * the middle of a transaction commit.  We also don't need the delalloc
5899          * mutex since we won't race with anybody.  We need this mostly to make
5900          * lockdep shut its filthy mouth.
5901          *
5902          * If we have a transaction open (can happen if we call truncate_block
5903          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5904          */
5905         if (btrfs_is_free_space_inode(inode)) {
5906                 flush = BTRFS_RESERVE_NO_FLUSH;
5907                 delalloc_lock = false;
5908         } else {
5909                 if (current->journal_info)
5910                         flush = BTRFS_RESERVE_FLUSH_LIMIT;
5911
5912                 if (btrfs_transaction_in_commit(fs_info))
5913                         schedule_timeout(1);
5914         }
5915
5916         if (delalloc_lock)
5917                 mutex_lock(&inode->delalloc_mutex);
5918
5919         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5920
5921         /*
5922          * We always want to do it this way, every other way is wrong and ends
5923          * in tears.  Pre-reserving the amount we are going to add will always
5924          * be the right way, because otherwise if we have enough parallelism we
5925          * could end up with thousands of inodes all holding little bits of
5926          * reservations they were able to make previously and the only way to
5927          * reclaim that space is to ENOSPC out the operations and clear
5928          * everything out and try again, which is bad.  This way we just
5929          * over-reserve slightly, and clean up the mess when we are done.
5930          */
5931         calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
5932                                 &qgroup_reserve);
5933         ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
5934         if (ret)
5935                 goto out_fail;
5936         ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
5937         if (ret)
5938                 goto out_qgroup;
5939
5940         /*
5941          * Now we need to update our outstanding extents and csum bytes _first_
5942          * and then add the reservation to the block_rsv.  This keeps us from
5943          * racing with an ordered completion or some such that would think it
5944          * needs to free the reservation we just made.
5945          */
5946         spin_lock(&inode->lock);
5947         nr_extents = count_max_extents(num_bytes);
5948         btrfs_mod_outstanding_extents(inode, nr_extents);
5949         inode->csum_bytes += num_bytes;
5950         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5951         spin_unlock(&inode->lock);
5952
5953         /* Now we can safely add our space to our block rsv */
5954         block_rsv_add_bytes(block_rsv, meta_reserve, false);
5955         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5956                                       btrfs_ino(inode), meta_reserve, 1);
5957
5958         spin_lock(&block_rsv->lock);
5959         block_rsv->qgroup_rsv_reserved += qgroup_reserve;
5960         spin_unlock(&block_rsv->lock);
5961
5962         if (delalloc_lock)
5963                 mutex_unlock(&inode->delalloc_mutex);
5964         return 0;
5965 out_qgroup:
5966         btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
5967 out_fail:
5968         btrfs_inode_rsv_release(inode, true);
5969         if (delalloc_lock)
5970                 mutex_unlock(&inode->delalloc_mutex);
5971         return ret;
5972 }
5973
5974 /**
5975  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5976  * @inode: the inode to release the reservation for.
5977  * @num_bytes: the number of bytes we are releasing.
5978  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5979  *
5980  * This will release the metadata reservation for an inode.  This can be called
5981  * once we complete IO for a given set of bytes to release their metadata
5982  * reservations, or on error for the same reason.
5983  */
5984 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5985                                      bool qgroup_free)
5986 {
5987         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5988
5989         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5990         spin_lock(&inode->lock);
5991         inode->csum_bytes -= num_bytes;
5992         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5993         spin_unlock(&inode->lock);
5994
5995         if (btrfs_is_testing(fs_info))
5996                 return;
5997
5998         btrfs_inode_rsv_release(inode, qgroup_free);
5999 }
6000
6001 /**
6002  * btrfs_delalloc_release_extents - release our outstanding_extents
6003  * @inode: the inode to balance the reservation for.
6004  * @num_bytes: the number of bytes we originally reserved with
6005  * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6006  *
6007  * When we reserve space we increase outstanding_extents for the extents we may
6008  * add.  Once we've set the range as delalloc or created our ordered extents we
6009  * have outstanding_extents to track the real usage, so we use this to free our
6010  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6011  * with btrfs_delalloc_reserve_metadata.
6012  */
6013 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6014                                     bool qgroup_free)
6015 {
6016         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6017         unsigned num_extents;
6018
6019         spin_lock(&inode->lock);
6020         num_extents = count_max_extents(num_bytes);
6021         btrfs_mod_outstanding_extents(inode, -num_extents);
6022         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6023         spin_unlock(&inode->lock);
6024
6025         if (btrfs_is_testing(fs_info))
6026                 return;
6027
6028         btrfs_inode_rsv_release(inode, qgroup_free);
6029 }
6030
6031 /**
6032  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6033  * delalloc
6034  * @inode: inode we're writing to
6035  * @start: start range we are writing to
6036  * @len: how long the range we are writing to
6037  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6038  *            current reservation.
6039  *
6040  * This will do the following things
6041  *
6042  * o reserve space in data space info for num bytes
6043  *   and reserve precious corresponding qgroup space
6044  *   (Done in check_data_free_space)
6045  *
6046  * o reserve space for metadata space, based on the number of outstanding
6047  *   extents and how much csums will be needed
6048  *   also reserve metadata space in a per root over-reserve method.
6049  * o add to the inodes->delalloc_bytes
6050  * o add it to the fs_info's delalloc inodes list.
6051  *   (Above 3 all done in delalloc_reserve_metadata)
6052  *
6053  * Return 0 for success
6054  * Return <0 for error(-ENOSPC or -EQUOT)
6055  */
6056 int btrfs_delalloc_reserve_space(struct inode *inode,
6057                         struct extent_changeset **reserved, u64 start, u64 len)
6058 {
6059         int ret;
6060
6061         ret = btrfs_check_data_free_space(inode, reserved, start, len);
6062         if (ret < 0)
6063                 return ret;
6064         ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6065         if (ret < 0)
6066                 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6067         return ret;
6068 }
6069
6070 /**
6071  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6072  * @inode: inode we're releasing space for
6073  * @start: start position of the space already reserved
6074  * @len: the len of the space already reserved
6075  * @release_bytes: the len of the space we consumed or didn't use
6076  *
6077  * This function will release the metadata space that was not used and will
6078  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6079  * list if there are no delalloc bytes left.
6080  * Also it will handle the qgroup reserved space.
6081  */
6082 void btrfs_delalloc_release_space(struct inode *inode,
6083                                   struct extent_changeset *reserved,
6084                                   u64 start, u64 len, bool qgroup_free)
6085 {
6086         btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6087         btrfs_free_reserved_data_space(inode, reserved, start, len);
6088 }
6089
6090 static int update_block_group(struct btrfs_trans_handle *trans,
6091                               u64 bytenr, u64 num_bytes, int alloc)
6092 {
6093         struct btrfs_fs_info *info = trans->fs_info;
6094         struct btrfs_block_group_cache *cache = NULL;
6095         u64 total = num_bytes;
6096         u64 old_val;
6097         u64 byte_in_group;
6098         int factor;
6099         int ret = 0;
6100
6101         /* block accounting for super block */
6102         spin_lock(&info->delalloc_root_lock);
6103         old_val = btrfs_super_bytes_used(info->super_copy);
6104         if (alloc)
6105                 old_val += num_bytes;
6106         else
6107                 old_val -= num_bytes;
6108         btrfs_set_super_bytes_used(info->super_copy, old_val);
6109         spin_unlock(&info->delalloc_root_lock);
6110
6111         while (total) {
6112                 cache = btrfs_lookup_block_group(info, bytenr);
6113                 if (!cache) {
6114                         ret = -ENOENT;
6115                         break;
6116                 }
6117                 factor = btrfs_bg_type_to_factor(cache->flags);
6118
6119                 /*
6120                  * If this block group has free space cache written out, we
6121                  * need to make sure to load it if we are removing space.  This
6122                  * is because we need the unpinning stage to actually add the
6123                  * space back to the block group, otherwise we will leak space.
6124                  */
6125                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6126                         cache_block_group(cache, 1);
6127
6128                 byte_in_group = bytenr - cache->key.objectid;
6129                 WARN_ON(byte_in_group > cache->key.offset);
6130
6131                 spin_lock(&cache->space_info->lock);
6132                 spin_lock(&cache->lock);
6133
6134                 if (btrfs_test_opt(info, SPACE_CACHE) &&
6135                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6136                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6137
6138                 old_val = btrfs_block_group_used(&cache->item);
6139                 num_bytes = min(total, cache->key.offset - byte_in_group);
6140                 if (alloc) {
6141                         old_val += num_bytes;
6142                         btrfs_set_block_group_used(&cache->item, old_val);
6143                         cache->reserved -= num_bytes;
6144                         cache->space_info->bytes_reserved -= num_bytes;
6145                         cache->space_info->bytes_used += num_bytes;
6146                         cache->space_info->disk_used += num_bytes * factor;
6147                         spin_unlock(&cache->lock);
6148                         spin_unlock(&cache->space_info->lock);
6149                 } else {
6150                         old_val -= num_bytes;
6151                         btrfs_set_block_group_used(&cache->item, old_val);
6152                         cache->pinned += num_bytes;
6153                         update_bytes_pinned(info, cache->space_info, num_bytes);
6154                         cache->space_info->bytes_used -= num_bytes;
6155                         cache->space_info->disk_used -= num_bytes * factor;
6156                         spin_unlock(&cache->lock);
6157                         spin_unlock(&cache->space_info->lock);
6158
6159                         trace_btrfs_space_reservation(info, "pinned",
6160                                                       cache->space_info->flags,
6161                                                       num_bytes, 1);
6162                         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6163                                            num_bytes,
6164                                            BTRFS_TOTAL_BYTES_PINNED_BATCH);
6165                         set_extent_dirty(info->pinned_extents,
6166                                          bytenr, bytenr + num_bytes - 1,
6167                                          GFP_NOFS | __GFP_NOFAIL);
6168                 }
6169
6170                 spin_lock(&trans->transaction->dirty_bgs_lock);
6171                 if (list_empty(&cache->dirty_list)) {
6172                         list_add_tail(&cache->dirty_list,
6173                                       &trans->transaction->dirty_bgs);
6174                         trans->delayed_ref_updates++;
6175                         btrfs_get_block_group(cache);
6176                 }
6177                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6178
6179                 /*
6180                  * No longer have used bytes in this block group, queue it for
6181                  * deletion. We do this after adding the block group to the
6182                  * dirty list to avoid races between cleaner kthread and space
6183                  * cache writeout.
6184                  */
6185                 if (!alloc && old_val == 0)
6186                         btrfs_mark_bg_unused(cache);
6187
6188                 btrfs_put_block_group(cache);
6189                 total -= num_bytes;
6190                 bytenr += num_bytes;
6191         }
6192
6193         /* Modified block groups are accounted for in the delayed_refs_rsv. */
6194         btrfs_update_delayed_refs_rsv(trans);
6195         return ret;
6196 }
6197
6198 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6199 {
6200         struct btrfs_block_group_cache *cache;
6201         u64 bytenr;
6202
6203         spin_lock(&fs_info->block_group_cache_lock);
6204         bytenr = fs_info->first_logical_byte;
6205         spin_unlock(&fs_info->block_group_cache_lock);
6206
6207         if (bytenr < (u64)-1)
6208                 return bytenr;
6209
6210         cache = btrfs_lookup_first_block_group(fs_info, search_start);
6211         if (!cache)
6212                 return 0;
6213
6214         bytenr = cache->key.objectid;
6215         btrfs_put_block_group(cache);
6216
6217         return bytenr;
6218 }
6219
6220 static int pin_down_extent(struct btrfs_block_group_cache *cache,
6221                            u64 bytenr, u64 num_bytes, int reserved)
6222 {
6223         struct btrfs_fs_info *fs_info = cache->fs_info;
6224
6225         spin_lock(&cache->space_info->lock);
6226         spin_lock(&cache->lock);
6227         cache->pinned += num_bytes;
6228         update_bytes_pinned(fs_info, cache->space_info, num_bytes);
6229         if (reserved) {
6230                 cache->reserved -= num_bytes;
6231                 cache->space_info->bytes_reserved -= num_bytes;
6232         }
6233         spin_unlock(&cache->lock);
6234         spin_unlock(&cache->space_info->lock);
6235
6236         trace_btrfs_space_reservation(fs_info, "pinned",
6237                                       cache->space_info->flags, num_bytes, 1);
6238         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6239                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6240         set_extent_dirty(fs_info->pinned_extents, bytenr,
6241                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6242         return 0;
6243 }
6244
6245 /*
6246  * this function must be called within transaction
6247  */
6248 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6249                      u64 bytenr, u64 num_bytes, int reserved)
6250 {
6251         struct btrfs_block_group_cache *cache;
6252
6253         cache = btrfs_lookup_block_group(fs_info, bytenr);
6254         BUG_ON(!cache); /* Logic error */
6255
6256         pin_down_extent(cache, bytenr, num_bytes, reserved);
6257
6258         btrfs_put_block_group(cache);
6259         return 0;
6260 }
6261
6262 /*
6263  * this function must be called within transaction
6264  */
6265 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6266                                     u64 bytenr, u64 num_bytes)
6267 {
6268         struct btrfs_block_group_cache *cache;
6269         int ret;
6270
6271         cache = btrfs_lookup_block_group(fs_info, bytenr);
6272         if (!cache)
6273                 return -EINVAL;
6274
6275         /*
6276          * pull in the free space cache (if any) so that our pin
6277          * removes the free space from the cache.  We have load_only set
6278          * to one because the slow code to read in the free extents does check
6279          * the pinned extents.
6280          */
6281         cache_block_group(cache, 1);
6282
6283         pin_down_extent(cache, bytenr, num_bytes, 0);
6284
6285         /* remove us from the free space cache (if we're there at all) */
6286         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6287         btrfs_put_block_group(cache);
6288         return ret;
6289 }
6290
6291 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6292                                    u64 start, u64 num_bytes)
6293 {
6294         int ret;
6295         struct btrfs_block_group_cache *block_group;
6296         struct btrfs_caching_control *caching_ctl;
6297
6298         block_group = btrfs_lookup_block_group(fs_info, start);
6299         if (!block_group)
6300                 return -EINVAL;
6301
6302         cache_block_group(block_group, 0);
6303         caching_ctl = get_caching_control(block_group);
6304
6305         if (!caching_ctl) {
6306                 /* Logic error */
6307                 BUG_ON(!block_group_cache_done(block_group));
6308                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6309         } else {
6310                 mutex_lock(&caching_ctl->mutex);
6311
6312                 if (start >= caching_ctl->progress) {
6313                         ret = add_excluded_extent(fs_info, start, num_bytes);
6314                 } else if (start + num_bytes <= caching_ctl->progress) {
6315                         ret = btrfs_remove_free_space(block_group,
6316                                                       start, num_bytes);
6317                 } else {
6318                         num_bytes = caching_ctl->progress - start;
6319                         ret = btrfs_remove_free_space(block_group,
6320                                                       start, num_bytes);
6321                         if (ret)
6322                                 goto out_lock;
6323
6324                         num_bytes = (start + num_bytes) -
6325                                 caching_ctl->progress;
6326                         start = caching_ctl->progress;
6327                         ret = add_excluded_extent(fs_info, start, num_bytes);
6328                 }
6329 out_lock:
6330                 mutex_unlock(&caching_ctl->mutex);
6331                 put_caching_control(caching_ctl);
6332         }
6333         btrfs_put_block_group(block_group);
6334         return ret;
6335 }
6336
6337 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
6338 {
6339         struct btrfs_fs_info *fs_info = eb->fs_info;
6340         struct btrfs_file_extent_item *item;
6341         struct btrfs_key key;
6342         int found_type;
6343         int i;
6344         int ret = 0;
6345
6346         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6347                 return 0;
6348
6349         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6350                 btrfs_item_key_to_cpu(eb, &key, i);
6351                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6352                         continue;
6353                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6354                 found_type = btrfs_file_extent_type(eb, item);
6355                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6356                         continue;
6357                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6358                         continue;
6359                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6360                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6361                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6362                 if (ret)
6363                         break;
6364         }
6365
6366         return ret;
6367 }
6368
6369 static void
6370 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6371 {
6372         atomic_inc(&bg->reservations);
6373 }
6374
6375 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6376                                         const u64 start)
6377 {
6378         struct btrfs_block_group_cache *bg;
6379
6380         bg = btrfs_lookup_block_group(fs_info, start);
6381         ASSERT(bg);
6382         if (atomic_dec_and_test(&bg->reservations))
6383                 wake_up_var(&bg->reservations);
6384         btrfs_put_block_group(bg);
6385 }
6386
6387 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6388 {
6389         struct btrfs_space_info *space_info = bg->space_info;
6390
6391         ASSERT(bg->ro);
6392
6393         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6394                 return;
6395
6396         /*
6397          * Our block group is read only but before we set it to read only,
6398          * some task might have had allocated an extent from it already, but it
6399          * has not yet created a respective ordered extent (and added it to a
6400          * root's list of ordered extents).
6401          * Therefore wait for any task currently allocating extents, since the
6402          * block group's reservations counter is incremented while a read lock
6403          * on the groups' semaphore is held and decremented after releasing
6404          * the read access on that semaphore and creating the ordered extent.
6405          */
6406         down_write(&space_info->groups_sem);
6407         up_write(&space_info->groups_sem);
6408
6409         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6410 }
6411
6412 /**
6413  * btrfs_add_reserved_bytes - update the block_group and space info counters
6414  * @cache:      The cache we are manipulating
6415  * @ram_bytes:  The number of bytes of file content, and will be same to
6416  *              @num_bytes except for the compress path.
6417  * @num_bytes:  The number of bytes in question
6418  * @delalloc:   The blocks are allocated for the delalloc write
6419  *
6420  * This is called by the allocator when it reserves space. If this is a
6421  * reservation and the block group has become read only we cannot make the
6422  * reservation and return -EAGAIN, otherwise this function always succeeds.
6423  */
6424 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6425                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6426 {
6427         struct btrfs_space_info *space_info = cache->space_info;
6428         int ret = 0;
6429
6430         spin_lock(&space_info->lock);
6431         spin_lock(&cache->lock);
6432         if (cache->ro) {
6433                 ret = -EAGAIN;
6434         } else {
6435                 cache->reserved += num_bytes;
6436                 space_info->bytes_reserved += num_bytes;
6437                 update_bytes_may_use(cache->fs_info, space_info, -ram_bytes);
6438                 if (delalloc)
6439                         cache->delalloc_bytes += num_bytes;
6440         }
6441         spin_unlock(&cache->lock);
6442         spin_unlock(&space_info->lock);
6443         return ret;
6444 }
6445
6446 /**
6447  * btrfs_free_reserved_bytes - update the block_group and space info counters
6448  * @cache:      The cache we are manipulating
6449  * @num_bytes:  The number of bytes in question
6450  * @delalloc:   The blocks are allocated for the delalloc write
6451  *
6452  * This is called by somebody who is freeing space that was never actually used
6453  * on disk.  For example if you reserve some space for a new leaf in transaction
6454  * A and before transaction A commits you free that leaf, you call this with
6455  * reserve set to 0 in order to clear the reservation.
6456  */
6457
6458 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6459                                       u64 num_bytes, int delalloc)
6460 {
6461         struct btrfs_space_info *space_info = cache->space_info;
6462
6463         spin_lock(&space_info->lock);
6464         spin_lock(&cache->lock);
6465         if (cache->ro)
6466                 space_info->bytes_readonly += num_bytes;
6467         cache->reserved -= num_bytes;
6468         space_info->bytes_reserved -= num_bytes;
6469         space_info->max_extent_size = 0;
6470
6471         if (delalloc)
6472                 cache->delalloc_bytes -= num_bytes;
6473         spin_unlock(&cache->lock);
6474         spin_unlock(&space_info->lock);
6475 }
6476 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6477 {
6478         struct btrfs_caching_control *next;
6479         struct btrfs_caching_control *caching_ctl;
6480         struct btrfs_block_group_cache *cache;
6481
6482         down_write(&fs_info->commit_root_sem);
6483
6484         list_for_each_entry_safe(caching_ctl, next,
6485                                  &fs_info->caching_block_groups, list) {
6486                 cache = caching_ctl->block_group;
6487                 if (block_group_cache_done(cache)) {
6488                         cache->last_byte_to_unpin = (u64)-1;
6489                         list_del_init(&caching_ctl->list);
6490                         put_caching_control(caching_ctl);
6491                 } else {
6492                         cache->last_byte_to_unpin = caching_ctl->progress;
6493                 }
6494         }
6495
6496         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6497                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6498         else
6499                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6500
6501         up_write(&fs_info->commit_root_sem);
6502
6503         update_global_block_rsv(fs_info);
6504 }
6505
6506 /*
6507  * Returns the free cluster for the given space info and sets empty_cluster to
6508  * what it should be based on the mount options.
6509  */
6510 static struct btrfs_free_cluster *
6511 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6512                    struct btrfs_space_info *space_info, u64 *empty_cluster)
6513 {
6514         struct btrfs_free_cluster *ret = NULL;
6515
6516         *empty_cluster = 0;
6517         if (btrfs_mixed_space_info(space_info))
6518                 return ret;
6519
6520         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6521                 ret = &fs_info->meta_alloc_cluster;
6522                 if (btrfs_test_opt(fs_info, SSD))
6523                         *empty_cluster = SZ_2M;
6524                 else
6525                         *empty_cluster = SZ_64K;
6526         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6527                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
6528                 *empty_cluster = SZ_2M;
6529                 ret = &fs_info->data_alloc_cluster;
6530         }
6531
6532         return ret;
6533 }
6534
6535 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6536                               u64 start, u64 end,
6537                               const bool return_free_space)
6538 {
6539         struct btrfs_block_group_cache *cache = NULL;
6540         struct btrfs_space_info *space_info;
6541         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6542         struct btrfs_free_cluster *cluster = NULL;
6543         u64 len;
6544         u64 total_unpinned = 0;
6545         u64 empty_cluster = 0;
6546         bool readonly;
6547
6548         while (start <= end) {
6549                 readonly = false;
6550                 if (!cache ||
6551                     start >= cache->key.objectid + cache->key.offset) {
6552                         if (cache)
6553                                 btrfs_put_block_group(cache);
6554                         total_unpinned = 0;
6555                         cache = btrfs_lookup_block_group(fs_info, start);
6556                         BUG_ON(!cache); /* Logic error */
6557
6558                         cluster = fetch_cluster_info(fs_info,
6559                                                      cache->space_info,
6560                                                      &empty_cluster);
6561                         empty_cluster <<= 1;
6562                 }
6563
6564                 len = cache->key.objectid + cache->key.offset - start;
6565                 len = min(len, end + 1 - start);
6566
6567                 if (start < cache->last_byte_to_unpin) {
6568                         len = min(len, cache->last_byte_to_unpin - start);
6569                         if (return_free_space)
6570                                 btrfs_add_free_space(cache, start, len);
6571                 }
6572
6573                 start += len;
6574                 total_unpinned += len;
6575                 space_info = cache->space_info;
6576
6577                 /*
6578                  * If this space cluster has been marked as fragmented and we've
6579                  * unpinned enough in this block group to potentially allow a
6580                  * cluster to be created inside of it go ahead and clear the
6581                  * fragmented check.
6582                  */
6583                 if (cluster && cluster->fragmented &&
6584                     total_unpinned > empty_cluster) {
6585                         spin_lock(&cluster->lock);
6586                         cluster->fragmented = 0;
6587                         spin_unlock(&cluster->lock);
6588                 }
6589
6590                 spin_lock(&space_info->lock);
6591                 spin_lock(&cache->lock);
6592                 cache->pinned -= len;
6593                 update_bytes_pinned(fs_info, space_info, -len);
6594
6595                 trace_btrfs_space_reservation(fs_info, "pinned",
6596                                               space_info->flags, len, 0);
6597                 space_info->max_extent_size = 0;
6598                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
6599                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6600                 if (cache->ro) {
6601                         space_info->bytes_readonly += len;
6602                         readonly = true;
6603                 }
6604                 spin_unlock(&cache->lock);
6605                 if (!readonly && return_free_space &&
6606                     global_rsv->space_info == space_info) {
6607                         u64 to_add = len;
6608
6609                         spin_lock(&global_rsv->lock);
6610                         if (!global_rsv->full) {
6611                                 to_add = min(len, global_rsv->size -
6612                                              global_rsv->reserved);
6613                                 global_rsv->reserved += to_add;
6614                                 update_bytes_may_use(fs_info, space_info,
6615                                                      to_add);
6616                                 if (global_rsv->reserved >= global_rsv->size)
6617                                         global_rsv->full = 1;
6618                                 trace_btrfs_space_reservation(fs_info,
6619                                                               "space_info",
6620                                                               space_info->flags,
6621                                                               to_add, 1);
6622                                 len -= to_add;
6623                         }
6624                         spin_unlock(&global_rsv->lock);
6625                         /* Add to any tickets we may have */
6626                         if (len)
6627                                 btrfs_space_info_add_new_bytes(fs_info,
6628                                                 space_info, len);
6629                 }
6630                 spin_unlock(&space_info->lock);
6631         }
6632
6633         if (cache)
6634                 btrfs_put_block_group(cache);
6635         return 0;
6636 }
6637
6638 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6639 {
6640         struct btrfs_fs_info *fs_info = trans->fs_info;
6641         struct btrfs_block_group_cache *block_group, *tmp;
6642         struct list_head *deleted_bgs;
6643         struct extent_io_tree *unpin;
6644         u64 start;
6645         u64 end;
6646         int ret;
6647
6648         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6649                 unpin = &fs_info->freed_extents[1];
6650         else
6651                 unpin = &fs_info->freed_extents[0];
6652
6653         while (!trans->aborted) {
6654                 struct extent_state *cached_state = NULL;
6655
6656                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6657                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6658                                             EXTENT_DIRTY, &cached_state);
6659                 if (ret) {
6660                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6661                         break;
6662                 }
6663
6664                 if (btrfs_test_opt(fs_info, DISCARD))
6665                         ret = btrfs_discard_extent(fs_info, start,
6666                                                    end + 1 - start, NULL);
6667
6668                 clear_extent_dirty(unpin, start, end, &cached_state);
6669                 unpin_extent_range(fs_info, start, end, true);
6670                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6671                 free_extent_state(cached_state);
6672                 cond_resched();
6673         }
6674
6675         /*
6676          * Transaction is finished.  We don't need the lock anymore.  We
6677          * do need to clean up the block groups in case of a transaction
6678          * abort.
6679          */
6680         deleted_bgs = &trans->transaction->deleted_bgs;
6681         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6682                 u64 trimmed = 0;
6683
6684                 ret = -EROFS;
6685                 if (!trans->aborted)
6686                         ret = btrfs_discard_extent(fs_info,
6687                                                    block_group->key.objectid,
6688                                                    block_group->key.offset,
6689                                                    &trimmed);
6690
6691                 list_del_init(&block_group->bg_list);
6692                 btrfs_put_block_group_trimming(block_group);
6693                 btrfs_put_block_group(block_group);
6694
6695                 if (ret) {
6696                         const char *errstr = btrfs_decode_error(ret);
6697                         btrfs_warn(fs_info,
6698                            "discard failed while removing blockgroup: errno=%d %s",
6699                                    ret, errstr);
6700                 }
6701         }
6702
6703         return 0;
6704 }
6705
6706 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6707                                struct btrfs_delayed_ref_node *node, u64 parent,
6708                                u64 root_objectid, u64 owner_objectid,
6709                                u64 owner_offset, int refs_to_drop,
6710                                struct btrfs_delayed_extent_op *extent_op)
6711 {
6712         struct btrfs_fs_info *info = trans->fs_info;
6713         struct btrfs_key key;
6714         struct btrfs_path *path;
6715         struct btrfs_root *extent_root = info->extent_root;
6716         struct extent_buffer *leaf;
6717         struct btrfs_extent_item *ei;
6718         struct btrfs_extent_inline_ref *iref;
6719         int ret;
6720         int is_data;
6721         int extent_slot = 0;
6722         int found_extent = 0;
6723         int num_to_del = 1;
6724         u32 item_size;
6725         u64 refs;
6726         u64 bytenr = node->bytenr;
6727         u64 num_bytes = node->num_bytes;
6728         int last_ref = 0;
6729         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6730
6731         path = btrfs_alloc_path();
6732         if (!path)
6733                 return -ENOMEM;
6734
6735         path->reada = READA_FORWARD;
6736         path->leave_spinning = 1;
6737
6738         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6739         BUG_ON(!is_data && refs_to_drop != 1);
6740
6741         if (is_data)
6742                 skinny_metadata = false;
6743
6744         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
6745                                     parent, root_objectid, owner_objectid,
6746                                     owner_offset);
6747         if (ret == 0) {
6748                 extent_slot = path->slots[0];
6749                 while (extent_slot >= 0) {
6750                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6751                                               extent_slot);
6752                         if (key.objectid != bytenr)
6753                                 break;
6754                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6755                             key.offset == num_bytes) {
6756                                 found_extent = 1;
6757                                 break;
6758                         }
6759                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6760                             key.offset == owner_objectid) {
6761                                 found_extent = 1;
6762                                 break;
6763                         }
6764                         if (path->slots[0] - extent_slot > 5)
6765                                 break;
6766                         extent_slot--;
6767                 }
6768
6769                 if (!found_extent) {
6770                         BUG_ON(iref);
6771                         ret = remove_extent_backref(trans, path, NULL,
6772                                                     refs_to_drop,
6773                                                     is_data, &last_ref);
6774                         if (ret) {
6775                                 btrfs_abort_transaction(trans, ret);
6776                                 goto out;
6777                         }
6778                         btrfs_release_path(path);
6779                         path->leave_spinning = 1;
6780
6781                         key.objectid = bytenr;
6782                         key.type = BTRFS_EXTENT_ITEM_KEY;
6783                         key.offset = num_bytes;
6784
6785                         if (!is_data && skinny_metadata) {
6786                                 key.type = BTRFS_METADATA_ITEM_KEY;
6787                                 key.offset = owner_objectid;
6788                         }
6789
6790                         ret = btrfs_search_slot(trans, extent_root,
6791                                                 &key, path, -1, 1);
6792                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6793                                 /*
6794                                  * Couldn't find our skinny metadata item,
6795                                  * see if we have ye olde extent item.
6796                                  */
6797                                 path->slots[0]--;
6798                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6799                                                       path->slots[0]);
6800                                 if (key.objectid == bytenr &&
6801                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6802                                     key.offset == num_bytes)
6803                                         ret = 0;
6804                         }
6805
6806                         if (ret > 0 && skinny_metadata) {
6807                                 skinny_metadata = false;
6808                                 key.objectid = bytenr;
6809                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6810                                 key.offset = num_bytes;
6811                                 btrfs_release_path(path);
6812                                 ret = btrfs_search_slot(trans, extent_root,
6813                                                         &key, path, -1, 1);
6814                         }
6815
6816                         if (ret) {
6817                                 btrfs_err(info,
6818                                           "umm, got %d back from search, was looking for %llu",
6819                                           ret, bytenr);
6820                                 if (ret > 0)
6821                                         btrfs_print_leaf(path->nodes[0]);
6822                         }
6823                         if (ret < 0) {
6824                                 btrfs_abort_transaction(trans, ret);
6825                                 goto out;
6826                         }
6827                         extent_slot = path->slots[0];
6828                 }
6829         } else if (WARN_ON(ret == -ENOENT)) {
6830                 btrfs_print_leaf(path->nodes[0]);
6831                 btrfs_err(info,
6832                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6833                         bytenr, parent, root_objectid, owner_objectid,
6834                         owner_offset);
6835                 btrfs_abort_transaction(trans, ret);
6836                 goto out;
6837         } else {
6838                 btrfs_abort_transaction(trans, ret);
6839                 goto out;
6840         }
6841
6842         leaf = path->nodes[0];
6843         item_size = btrfs_item_size_nr(leaf, extent_slot);
6844         if (unlikely(item_size < sizeof(*ei))) {
6845                 ret = -EINVAL;
6846                 btrfs_print_v0_err(info);
6847                 btrfs_abort_transaction(trans, ret);
6848                 goto out;
6849         }
6850         ei = btrfs_item_ptr(leaf, extent_slot,
6851                             struct btrfs_extent_item);
6852         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6853             key.type == BTRFS_EXTENT_ITEM_KEY) {
6854                 struct btrfs_tree_block_info *bi;
6855                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6856                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6857                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6858         }
6859
6860         refs = btrfs_extent_refs(leaf, ei);
6861         if (refs < refs_to_drop) {
6862                 btrfs_err(info,
6863                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
6864                           refs_to_drop, refs, bytenr);
6865                 ret = -EINVAL;
6866                 btrfs_abort_transaction(trans, ret);
6867                 goto out;
6868         }
6869         refs -= refs_to_drop;
6870
6871         if (refs > 0) {
6872                 if (extent_op)
6873                         __run_delayed_extent_op(extent_op, leaf, ei);
6874                 /*
6875                  * In the case of inline back ref, reference count will
6876                  * be updated by remove_extent_backref
6877                  */
6878                 if (iref) {
6879                         BUG_ON(!found_extent);
6880                 } else {
6881                         btrfs_set_extent_refs(leaf, ei, refs);
6882                         btrfs_mark_buffer_dirty(leaf);
6883                 }
6884                 if (found_extent) {
6885                         ret = remove_extent_backref(trans, path, iref,
6886                                                     refs_to_drop, is_data,
6887                                                     &last_ref);
6888                         if (ret) {
6889                                 btrfs_abort_transaction(trans, ret);
6890                                 goto out;
6891                         }
6892                 }
6893         } else {
6894                 if (found_extent) {
6895                         BUG_ON(is_data && refs_to_drop !=
6896                                extent_data_ref_count(path, iref));
6897                         if (iref) {
6898                                 BUG_ON(path->slots[0] != extent_slot);
6899                         } else {
6900                                 BUG_ON(path->slots[0] != extent_slot + 1);
6901                                 path->slots[0] = extent_slot;
6902                                 num_to_del = 2;
6903                         }
6904                 }
6905
6906                 last_ref = 1;
6907                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6908                                       num_to_del);
6909                 if (ret) {
6910                         btrfs_abort_transaction(trans, ret);
6911                         goto out;
6912                 }
6913                 btrfs_release_path(path);
6914
6915                 if (is_data) {
6916                         ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
6917                         if (ret) {
6918                                 btrfs_abort_transaction(trans, ret);
6919                                 goto out;
6920                         }
6921                 }
6922
6923                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
6924                 if (ret) {
6925                         btrfs_abort_transaction(trans, ret);
6926                         goto out;
6927                 }
6928
6929                 ret = update_block_group(trans, bytenr, num_bytes, 0);
6930                 if (ret) {
6931                         btrfs_abort_transaction(trans, ret);
6932                         goto out;
6933                 }
6934         }
6935         btrfs_release_path(path);
6936
6937 out:
6938         btrfs_free_path(path);
6939         return ret;
6940 }
6941
6942 /*
6943  * when we free an block, it is possible (and likely) that we free the last
6944  * delayed ref for that extent as well.  This searches the delayed ref tree for
6945  * a given extent, and if there are no other delayed refs to be processed, it
6946  * removes it from the tree.
6947  */
6948 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6949                                       u64 bytenr)
6950 {
6951         struct btrfs_delayed_ref_head *head;
6952         struct btrfs_delayed_ref_root *delayed_refs;
6953         int ret = 0;
6954
6955         delayed_refs = &trans->transaction->delayed_refs;
6956         spin_lock(&delayed_refs->lock);
6957         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
6958         if (!head)
6959                 goto out_delayed_unlock;
6960
6961         spin_lock(&head->lock);
6962         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
6963                 goto out;
6964
6965         if (cleanup_extent_op(head) != NULL)
6966                 goto out;
6967
6968         /*
6969          * waiting for the lock here would deadlock.  If someone else has it
6970          * locked they are already in the process of dropping it anyway
6971          */
6972         if (!mutex_trylock(&head->mutex))
6973                 goto out;
6974
6975         btrfs_delete_ref_head(delayed_refs, head);
6976         head->processing = 0;
6977
6978         spin_unlock(&head->lock);
6979         spin_unlock(&delayed_refs->lock);
6980
6981         BUG_ON(head->extent_op);
6982         if (head->must_insert_reserved)
6983                 ret = 1;
6984
6985         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
6986         mutex_unlock(&head->mutex);
6987         btrfs_put_delayed_ref_head(head);
6988         return ret;
6989 out:
6990         spin_unlock(&head->lock);
6991
6992 out_delayed_unlock:
6993         spin_unlock(&delayed_refs->lock);
6994         return 0;
6995 }
6996
6997 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6998                            struct btrfs_root *root,
6999                            struct extent_buffer *buf,
7000                            u64 parent, int last_ref)
7001 {
7002         struct btrfs_fs_info *fs_info = root->fs_info;
7003         struct btrfs_ref generic_ref = { 0 };
7004         int pin = 1;
7005         int ret;
7006
7007         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
7008                                buf->start, buf->len, parent);
7009         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
7010                             root->root_key.objectid);
7011
7012         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7013                 int old_ref_mod, new_ref_mod;
7014
7015                 btrfs_ref_tree_mod(fs_info, &generic_ref);
7016                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
7017                                                  &old_ref_mod, &new_ref_mod);
7018                 BUG_ON(ret); /* -ENOMEM */
7019                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
7020         }
7021
7022         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7023                 struct btrfs_block_group_cache *cache;
7024
7025                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7026                         ret = check_ref_cleanup(trans, buf->start);
7027                         if (!ret)
7028                                 goto out;
7029                 }
7030
7031                 pin = 0;
7032                 cache = btrfs_lookup_block_group(fs_info, buf->start);
7033
7034                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7035                         pin_down_extent(cache, buf->start, buf->len, 1);
7036                         btrfs_put_block_group(cache);
7037                         goto out;
7038                 }
7039
7040                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7041
7042                 btrfs_add_free_space(cache, buf->start, buf->len);
7043                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7044                 btrfs_put_block_group(cache);
7045                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7046         }
7047 out:
7048         if (pin)
7049                 add_pinned_bytes(fs_info, &generic_ref);
7050
7051         if (last_ref) {
7052                 /*
7053                  * Deleting the buffer, clear the corrupt flag since it doesn't
7054                  * matter anymore.
7055                  */
7056                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7057         }
7058 }
7059
7060 /* Can return -ENOMEM */
7061 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
7062 {
7063         struct btrfs_fs_info *fs_info = trans->fs_info;
7064         int old_ref_mod, new_ref_mod;
7065         int ret;
7066
7067         if (btrfs_is_testing(fs_info))
7068                 return 0;
7069
7070         /*
7071          * tree log blocks never actually go into the extent allocation
7072          * tree, just update pinning info and exit early.
7073          */
7074         if ((ref->type == BTRFS_REF_METADATA &&
7075              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
7076             (ref->type == BTRFS_REF_DATA &&
7077              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
7078                 /* unlocks the pinned mutex */
7079                 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
7080                 old_ref_mod = new_ref_mod = 0;
7081                 ret = 0;
7082         } else if (ref->type == BTRFS_REF_METADATA) {
7083                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
7084                                                  &old_ref_mod, &new_ref_mod);
7085         } else {
7086                 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
7087                                                  &old_ref_mod, &new_ref_mod);
7088         }
7089
7090         if (!((ref->type == BTRFS_REF_METADATA &&
7091                ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
7092               (ref->type == BTRFS_REF_DATA &&
7093                ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
7094                 btrfs_ref_tree_mod(fs_info, ref);
7095
7096         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7097                 add_pinned_bytes(fs_info, ref);
7098
7099         return ret;
7100 }
7101
7102 /*
7103  * when we wait for progress in the block group caching, its because
7104  * our allocation attempt failed at least once.  So, we must sleep
7105  * and let some progress happen before we try again.
7106  *
7107  * This function will sleep at least once waiting for new free space to
7108  * show up, and then it will check the block group free space numbers
7109  * for our min num_bytes.  Another option is to have it go ahead
7110  * and look in the rbtree for a free extent of a given size, but this
7111  * is a good start.
7112  *
7113  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7114  * any of the information in this block group.
7115  */
7116 static noinline void
7117 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7118                                 u64 num_bytes)
7119 {
7120         struct btrfs_caching_control *caching_ctl;
7121
7122         caching_ctl = get_caching_control(cache);
7123         if (!caching_ctl)
7124                 return;
7125
7126         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7127                    (cache->free_space_ctl->free_space >= num_bytes));
7128
7129         put_caching_control(caching_ctl);
7130 }
7131
7132 static noinline int
7133 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7134 {
7135         struct btrfs_caching_control *caching_ctl;
7136         int ret = 0;
7137
7138         caching_ctl = get_caching_control(cache);
7139         if (!caching_ctl)
7140                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7141
7142         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7143         if (cache->cached == BTRFS_CACHE_ERROR)
7144                 ret = -EIO;
7145         put_caching_control(caching_ctl);
7146         return ret;
7147 }
7148
7149 enum btrfs_loop_type {
7150         LOOP_CACHING_NOWAIT,
7151         LOOP_CACHING_WAIT,
7152         LOOP_ALLOC_CHUNK,
7153         LOOP_NO_EMPTY_SIZE,
7154 };
7155
7156 static inline void
7157 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7158                        int delalloc)
7159 {
7160         if (delalloc)
7161                 down_read(&cache->data_rwsem);
7162 }
7163
7164 static inline void
7165 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7166                        int delalloc)
7167 {
7168         btrfs_get_block_group(cache);
7169         if (delalloc)
7170                 down_read(&cache->data_rwsem);
7171 }
7172
7173 static struct btrfs_block_group_cache *
7174 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7175                    struct btrfs_free_cluster *cluster,
7176                    int delalloc)
7177 {
7178         struct btrfs_block_group_cache *used_bg = NULL;
7179
7180         spin_lock(&cluster->refill_lock);
7181         while (1) {
7182                 used_bg = cluster->block_group;
7183                 if (!used_bg)
7184                         return NULL;
7185
7186                 if (used_bg == block_group)
7187                         return used_bg;
7188
7189                 btrfs_get_block_group(used_bg);
7190
7191                 if (!delalloc)
7192                         return used_bg;
7193
7194                 if (down_read_trylock(&used_bg->data_rwsem))
7195                         return used_bg;
7196
7197                 spin_unlock(&cluster->refill_lock);
7198
7199                 /* We should only have one-level nested. */
7200                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7201
7202                 spin_lock(&cluster->refill_lock);
7203                 if (used_bg == cluster->block_group)
7204                         return used_bg;
7205
7206                 up_read(&used_bg->data_rwsem);
7207                 btrfs_put_block_group(used_bg);
7208         }
7209 }
7210
7211 static inline void
7212 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7213                          int delalloc)
7214 {
7215         if (delalloc)
7216                 up_read(&cache->data_rwsem);
7217         btrfs_put_block_group(cache);
7218 }
7219
7220 /*
7221  * Structure used internally for find_free_extent() function.  Wraps needed
7222  * parameters.
7223  */
7224 struct find_free_extent_ctl {
7225         /* Basic allocation info */
7226         u64 ram_bytes;
7227         u64 num_bytes;
7228         u64 empty_size;
7229         u64 flags;
7230         int delalloc;
7231
7232         /* Where to start the search inside the bg */
7233         u64 search_start;
7234
7235         /* For clustered allocation */
7236         u64 empty_cluster;
7237
7238         bool have_caching_bg;
7239         bool orig_have_caching_bg;
7240
7241         /* RAID index, converted from flags */
7242         int index;
7243
7244         /*
7245          * Current loop number, check find_free_extent_update_loop() for details
7246          */
7247         int loop;
7248
7249         /*
7250          * Whether we're refilling a cluster, if true we need to re-search
7251          * current block group but don't try to refill the cluster again.
7252          */
7253         bool retry_clustered;
7254
7255         /*
7256          * Whether we're updating free space cache, if true we need to re-search
7257          * current block group but don't try updating free space cache again.
7258          */
7259         bool retry_unclustered;
7260
7261         /* If current block group is cached */
7262         int cached;
7263
7264         /* Max contiguous hole found */
7265         u64 max_extent_size;
7266
7267         /* Total free space from free space cache, not always contiguous */
7268         u64 total_free_space;
7269
7270         /* Found result */
7271         u64 found_offset;
7272 };
7273
7274
7275 /*
7276  * Helper function for find_free_extent().
7277  *
7278  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
7279  * Return -EAGAIN to inform caller that we need to re-search this block group
7280  * Return >0 to inform caller that we find nothing
7281  * Return 0 means we have found a location and set ffe_ctl->found_offset.
7282  */
7283 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7284                 struct btrfs_free_cluster *last_ptr,
7285                 struct find_free_extent_ctl *ffe_ctl,
7286                 struct btrfs_block_group_cache **cluster_bg_ret)
7287 {
7288         struct btrfs_block_group_cache *cluster_bg;
7289         u64 aligned_cluster;
7290         u64 offset;
7291         int ret;
7292
7293         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
7294         if (!cluster_bg)
7295                 goto refill_cluster;
7296         if (cluster_bg != bg && (cluster_bg->ro ||
7297             !block_group_bits(cluster_bg, ffe_ctl->flags)))
7298                 goto release_cluster;
7299
7300         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7301                         ffe_ctl->num_bytes, cluster_bg->key.objectid,
7302                         &ffe_ctl->max_extent_size);
7303         if (offset) {
7304                 /* We have a block, we're done */
7305                 spin_unlock(&last_ptr->refill_lock);
7306                 trace_btrfs_reserve_extent_cluster(cluster_bg,
7307                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
7308                 *cluster_bg_ret = cluster_bg;
7309                 ffe_ctl->found_offset = offset;
7310                 return 0;
7311         }
7312         WARN_ON(last_ptr->block_group != cluster_bg);
7313
7314 release_cluster:
7315         /*
7316          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
7317          * lets just skip it and let the allocator find whatever block it can
7318          * find. If we reach this point, we will have tried the cluster
7319          * allocator plenty of times and not have found anything, so we are
7320          * likely way too fragmented for the clustering stuff to find anything.
7321          *
7322          * However, if the cluster is taken from the current block group,
7323          * release the cluster first, so that we stand a better chance of
7324          * succeeding in the unclustered allocation.
7325          */
7326         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7327                 spin_unlock(&last_ptr->refill_lock);
7328                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7329                 return -ENOENT;
7330         }
7331
7332         /* This cluster didn't work out, free it and start over */
7333         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7334
7335         if (cluster_bg != bg)
7336                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7337
7338 refill_cluster:
7339         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7340                 spin_unlock(&last_ptr->refill_lock);
7341                 return -ENOENT;
7342         }
7343
7344         aligned_cluster = max_t(u64,
7345                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7346                         bg->full_stripe_len);
7347         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
7348                         ffe_ctl->num_bytes, aligned_cluster);
7349         if (ret == 0) {
7350                 /* Now pull our allocation out of this cluster */
7351                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
7352                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
7353                                 &ffe_ctl->max_extent_size);
7354                 if (offset) {
7355                         /* We found one, proceed */
7356                         spin_unlock(&last_ptr->refill_lock);
7357                         trace_btrfs_reserve_extent_cluster(bg,
7358                                         ffe_ctl->search_start,
7359                                         ffe_ctl->num_bytes);
7360                         ffe_ctl->found_offset = offset;
7361                         return 0;
7362                 }
7363         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7364                    !ffe_ctl->retry_clustered) {
7365                 spin_unlock(&last_ptr->refill_lock);
7366
7367                 ffe_ctl->retry_clustered = true;
7368                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7369                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7370                 return -EAGAIN;
7371         }
7372         /*
7373          * At this point we either didn't find a cluster or we weren't able to
7374          * allocate a block from our cluster.  Free the cluster we've been
7375          * trying to use, and go to the next block group.
7376          */
7377         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7378         spin_unlock(&last_ptr->refill_lock);
7379         return 1;
7380 }
7381
7382 /*
7383  * Return >0 to inform caller that we find nothing
7384  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
7385  * Return -EAGAIN to inform caller that we need to re-search this block group
7386  */
7387 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7388                 struct btrfs_free_cluster *last_ptr,
7389                 struct find_free_extent_ctl *ffe_ctl)
7390 {
7391         u64 offset;
7392
7393         /*
7394          * We are doing an unclustered allocation, set the fragmented flag so
7395          * we don't bother trying to setup a cluster again until we get more
7396          * space.
7397          */
7398         if (unlikely(last_ptr)) {
7399                 spin_lock(&last_ptr->lock);
7400                 last_ptr->fragmented = 1;
7401                 spin_unlock(&last_ptr->lock);
7402         }
7403         if (ffe_ctl->cached) {
7404                 struct btrfs_free_space_ctl *free_space_ctl;
7405
7406                 free_space_ctl = bg->free_space_ctl;
7407                 spin_lock(&free_space_ctl->tree_lock);
7408                 if (free_space_ctl->free_space <
7409                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7410                     ffe_ctl->empty_size) {
7411                         ffe_ctl->total_free_space = max_t(u64,
7412                                         ffe_ctl->total_free_space,
7413                                         free_space_ctl->free_space);
7414                         spin_unlock(&free_space_ctl->tree_lock);
7415                         return 1;
7416                 }
7417                 spin_unlock(&free_space_ctl->tree_lock);
7418         }
7419
7420         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7421                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
7422                         &ffe_ctl->max_extent_size);
7423
7424         /*
7425          * If we didn't find a chunk, and we haven't failed on this block group
7426          * before, and this block group is in the middle of caching and we are
7427          * ok with waiting, then go ahead and wait for progress to be made, and
7428          * set @retry_unclustered to true.
7429          *
7430          * If @retry_unclustered is true then we've already waited on this
7431          * block group once and should move on to the next block group.
7432          */
7433         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7434             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7435                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7436                                                 ffe_ctl->empty_size);
7437                 ffe_ctl->retry_unclustered = true;
7438                 return -EAGAIN;
7439         } else if (!offset) {
7440                 return 1;
7441         }
7442         ffe_ctl->found_offset = offset;
7443         return 0;
7444 }
7445
7446 /*
7447  * Return >0 means caller needs to re-search for free extent
7448  * Return 0 means we have the needed free extent.
7449  * Return <0 means we failed to locate any free extent.
7450  */
7451 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7452                                         struct btrfs_free_cluster *last_ptr,
7453                                         struct btrfs_key *ins,
7454                                         struct find_free_extent_ctl *ffe_ctl,
7455                                         int full_search, bool use_cluster)
7456 {
7457         struct btrfs_root *root = fs_info->extent_root;
7458         int ret;
7459
7460         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7461             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7462                 ffe_ctl->orig_have_caching_bg = true;
7463
7464         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7465             ffe_ctl->have_caching_bg)
7466                 return 1;
7467
7468         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7469                 return 1;
7470
7471         if (ins->objectid) {
7472                 if (!use_cluster && last_ptr) {
7473                         spin_lock(&last_ptr->lock);
7474                         last_ptr->window_start = ins->objectid;
7475                         spin_unlock(&last_ptr->lock);
7476                 }
7477                 return 0;
7478         }
7479
7480         /*
7481          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7482          *                      caching kthreads as we move along
7483          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7484          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7485          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7486          *                     again
7487          */
7488         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7489                 ffe_ctl->index = 0;
7490                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7491                         /*
7492                          * We want to skip the LOOP_CACHING_WAIT step if we
7493                          * don't have any uncached bgs and we've already done a
7494                          * full search through.
7495                          */
7496                         if (ffe_ctl->orig_have_caching_bg || !full_search)
7497                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
7498                         else
7499                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7500                 } else {
7501                         ffe_ctl->loop++;
7502                 }
7503
7504                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7505                         struct btrfs_trans_handle *trans;
7506                         int exist = 0;
7507
7508                         trans = current->journal_info;
7509                         if (trans)
7510                                 exist = 1;
7511                         else
7512                                 trans = btrfs_join_transaction(root);
7513
7514                         if (IS_ERR(trans)) {
7515                                 ret = PTR_ERR(trans);
7516                                 return ret;
7517                         }
7518
7519                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
7520                                                 CHUNK_ALLOC_FORCE);
7521
7522                         /*
7523                          * If we can't allocate a new chunk we've already looped
7524                          * through at least once, move on to the NO_EMPTY_SIZE
7525                          * case.
7526                          */
7527                         if (ret == -ENOSPC)
7528                                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7529
7530                         /* Do not bail out on ENOSPC since we can do more. */
7531                         if (ret < 0 && ret != -ENOSPC)
7532                                 btrfs_abort_transaction(trans, ret);
7533                         else
7534                                 ret = 0;
7535                         if (!exist)
7536                                 btrfs_end_transaction(trans);
7537                         if (ret)
7538                                 return ret;
7539                 }
7540
7541                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7542                         /*
7543                          * Don't loop again if we already have no empty_size and
7544                          * no empty_cluster.
7545                          */
7546                         if (ffe_ctl->empty_size == 0 &&
7547                             ffe_ctl->empty_cluster == 0)
7548                                 return -ENOSPC;
7549                         ffe_ctl->empty_size = 0;
7550                         ffe_ctl->empty_cluster = 0;
7551                 }
7552                 return 1;
7553         }
7554         return -ENOSPC;
7555 }
7556
7557 /*
7558  * walks the btree of allocated extents and find a hole of a given size.
7559  * The key ins is changed to record the hole:
7560  * ins->objectid == start position
7561  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7562  * ins->offset == the size of the hole.
7563  * Any available blocks before search_start are skipped.
7564  *
7565  * If there is no suitable free space, we will record the max size of
7566  * the free space extent currently.
7567  *
7568  * The overall logic and call chain:
7569  *
7570  * find_free_extent()
7571  * |- Iterate through all block groups
7572  * |  |- Get a valid block group
7573  * |  |- Try to do clustered allocation in that block group
7574  * |  |- Try to do unclustered allocation in that block group
7575  * |  |- Check if the result is valid
7576  * |  |  |- If valid, then exit
7577  * |  |- Jump to next block group
7578  * |
7579  * |- Push harder to find free extents
7580  *    |- If not found, re-iterate all block groups
7581  */
7582 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7583                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7584                                 u64 hint_byte, struct btrfs_key *ins,
7585                                 u64 flags, int delalloc)
7586 {
7587         int ret = 0;
7588         struct btrfs_free_cluster *last_ptr = NULL;
7589         struct btrfs_block_group_cache *block_group = NULL;
7590         struct find_free_extent_ctl ffe_ctl = {0};
7591         struct btrfs_space_info *space_info;
7592         bool use_cluster = true;
7593         bool full_search = false;
7594
7595         WARN_ON(num_bytes < fs_info->sectorsize);
7596
7597         ffe_ctl.ram_bytes = ram_bytes;
7598         ffe_ctl.num_bytes = num_bytes;
7599         ffe_ctl.empty_size = empty_size;
7600         ffe_ctl.flags = flags;
7601         ffe_ctl.search_start = 0;
7602         ffe_ctl.retry_clustered = false;
7603         ffe_ctl.retry_unclustered = false;
7604         ffe_ctl.delalloc = delalloc;
7605         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7606         ffe_ctl.have_caching_bg = false;
7607         ffe_ctl.orig_have_caching_bg = false;
7608         ffe_ctl.found_offset = 0;
7609
7610         ins->type = BTRFS_EXTENT_ITEM_KEY;
7611         ins->objectid = 0;
7612         ins->offset = 0;
7613
7614         trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7615
7616         space_info = btrfs_find_space_info(fs_info, flags);
7617         if (!space_info) {
7618                 btrfs_err(fs_info, "No space info for %llu", flags);
7619                 return -ENOSPC;
7620         }
7621
7622         /*
7623          * If our free space is heavily fragmented we may not be able to make
7624          * big contiguous allocations, so instead of doing the expensive search
7625          * for free space, simply return ENOSPC with our max_extent_size so we
7626          * can go ahead and search for a more manageable chunk.
7627          *
7628          * If our max_extent_size is large enough for our allocation simply
7629          * disable clustering since we will likely not be able to find enough
7630          * space to create a cluster and induce latency trying.
7631          */
7632         if (unlikely(space_info->max_extent_size)) {
7633                 spin_lock(&space_info->lock);
7634                 if (space_info->max_extent_size &&
7635                     num_bytes > space_info->max_extent_size) {
7636                         ins->offset = space_info->max_extent_size;
7637                         spin_unlock(&space_info->lock);
7638                         return -ENOSPC;
7639                 } else if (space_info->max_extent_size) {
7640                         use_cluster = false;
7641                 }
7642                 spin_unlock(&space_info->lock);
7643         }
7644
7645         last_ptr = fetch_cluster_info(fs_info, space_info,
7646                                       &ffe_ctl.empty_cluster);
7647         if (last_ptr) {
7648                 spin_lock(&last_ptr->lock);
7649                 if (last_ptr->block_group)
7650                         hint_byte = last_ptr->window_start;
7651                 if (last_ptr->fragmented) {
7652                         /*
7653                          * We still set window_start so we can keep track of the
7654                          * last place we found an allocation to try and save
7655                          * some time.
7656                          */
7657                         hint_byte = last_ptr->window_start;
7658                         use_cluster = false;
7659                 }
7660                 spin_unlock(&last_ptr->lock);
7661         }
7662
7663         ffe_ctl.search_start = max(ffe_ctl.search_start,
7664                                    first_logical_byte(fs_info, 0));
7665         ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
7666         if (ffe_ctl.search_start == hint_byte) {
7667                 block_group = btrfs_lookup_block_group(fs_info,
7668                                                        ffe_ctl.search_start);
7669                 /*
7670                  * we don't want to use the block group if it doesn't match our
7671                  * allocation bits, or if its not cached.
7672                  *
7673                  * However if we are re-searching with an ideal block group
7674                  * picked out then we don't care that the block group is cached.
7675                  */
7676                 if (block_group && block_group_bits(block_group, flags) &&
7677                     block_group->cached != BTRFS_CACHE_NO) {
7678                         down_read(&space_info->groups_sem);
7679                         if (list_empty(&block_group->list) ||
7680                             block_group->ro) {
7681                                 /*
7682                                  * someone is removing this block group,
7683                                  * we can't jump into the have_block_group
7684                                  * target because our list pointers are not
7685                                  * valid
7686                                  */
7687                                 btrfs_put_block_group(block_group);
7688                                 up_read(&space_info->groups_sem);
7689                         } else {
7690                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7691                                                 block_group->flags);
7692                                 btrfs_lock_block_group(block_group, delalloc);
7693                                 goto have_block_group;
7694                         }
7695                 } else if (block_group) {
7696                         btrfs_put_block_group(block_group);
7697                 }
7698         }
7699 search:
7700         ffe_ctl.have_caching_bg = false;
7701         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7702             ffe_ctl.index == 0)
7703                 full_search = true;
7704         down_read(&space_info->groups_sem);
7705         list_for_each_entry(block_group,
7706                             &space_info->block_groups[ffe_ctl.index], list) {
7707                 /* If the block group is read-only, we can skip it entirely. */
7708                 if (unlikely(block_group->ro))
7709                         continue;
7710
7711                 btrfs_grab_block_group(block_group, delalloc);
7712                 ffe_ctl.search_start = block_group->key.objectid;
7713
7714                 /*
7715                  * this can happen if we end up cycling through all the
7716                  * raid types, but we want to make sure we only allocate
7717                  * for the proper type.
7718                  */
7719                 if (!block_group_bits(block_group, flags)) {
7720                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
7721                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
7722                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
7723                                 BTRFS_BLOCK_GROUP_RAID10;
7724
7725                         /*
7726                          * if they asked for extra copies and this block group
7727                          * doesn't provide them, bail.  This does allow us to
7728                          * fill raid0 from raid1.
7729                          */
7730                         if ((flags & extra) && !(block_group->flags & extra))
7731                                 goto loop;
7732                 }
7733
7734 have_block_group:
7735                 ffe_ctl.cached = block_group_cache_done(block_group);
7736                 if (unlikely(!ffe_ctl.cached)) {
7737                         ffe_ctl.have_caching_bg = true;
7738                         ret = cache_block_group(block_group, 0);
7739                         BUG_ON(ret < 0);
7740                         ret = 0;
7741                 }
7742
7743                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7744                         goto loop;
7745
7746                 /*
7747                  * Ok we want to try and use the cluster allocator, so
7748                  * lets look there
7749                  */
7750                 if (last_ptr && use_cluster) {
7751                         struct btrfs_block_group_cache *cluster_bg = NULL;
7752
7753                         ret = find_free_extent_clustered(block_group, last_ptr,
7754                                                          &ffe_ctl, &cluster_bg);
7755
7756                         if (ret == 0) {
7757                                 if (cluster_bg && cluster_bg != block_group) {
7758                                         btrfs_release_block_group(block_group,
7759                                                                   delalloc);
7760                                         block_group = cluster_bg;
7761                                 }
7762                                 goto checks;
7763                         } else if (ret == -EAGAIN) {
7764                                 goto have_block_group;
7765                         } else if (ret > 0) {
7766                                 goto loop;
7767                         }
7768                         /* ret == -ENOENT case falls through */
7769                 }
7770
7771                 ret = find_free_extent_unclustered(block_group, last_ptr,
7772                                                    &ffe_ctl);
7773                 if (ret == -EAGAIN)
7774                         goto have_block_group;
7775                 else if (ret > 0)
7776                         goto loop;
7777                 /* ret == 0 case falls through */
7778 checks:
7779                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
7780                                              fs_info->stripesize);
7781
7782                 /* move on to the next group */
7783                 if (ffe_ctl.search_start + num_bytes >
7784                     block_group->key.objectid + block_group->key.offset) {
7785                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7786                                              num_bytes);
7787                         goto loop;
7788                 }
7789
7790                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
7791                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7792                                 ffe_ctl.search_start - ffe_ctl.found_offset);
7793
7794                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7795                                 num_bytes, delalloc);
7796                 if (ret == -EAGAIN) {
7797                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
7798                                              num_bytes);
7799                         goto loop;
7800                 }
7801                 btrfs_inc_block_group_reservations(block_group);
7802
7803                 /* we are all good, lets return */
7804                 ins->objectid = ffe_ctl.search_start;
7805                 ins->offset = num_bytes;
7806
7807                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
7808                                            num_bytes);
7809                 btrfs_release_block_group(block_group, delalloc);
7810                 break;
7811 loop:
7812                 ffe_ctl.retry_clustered = false;
7813                 ffe_ctl.retry_unclustered = false;
7814                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7815                        ffe_ctl.index);
7816                 btrfs_release_block_group(block_group, delalloc);
7817                 cond_resched();
7818         }
7819         up_read(&space_info->groups_sem);
7820
7821         ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
7822                                            full_search, use_cluster);
7823         if (ret > 0)
7824                 goto search;
7825
7826         if (ret == -ENOSPC) {
7827                 /*
7828                  * Use ffe_ctl->total_free_space as fallback if we can't find
7829                  * any contiguous hole.
7830                  */
7831                 if (!ffe_ctl.max_extent_size)
7832                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
7833                 spin_lock(&space_info->lock);
7834                 space_info->max_extent_size = ffe_ctl.max_extent_size;
7835                 spin_unlock(&space_info->lock);
7836                 ins->offset = ffe_ctl.max_extent_size;
7837         }
7838         return ret;
7839 }
7840
7841 #define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
7842 do {                                                                    \
7843         struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
7844         spin_lock(&__rsv->lock);                                        \
7845         btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
7846                    __rsv->size, __rsv->reserved);                       \
7847         spin_unlock(&__rsv->lock);                                      \
7848 } while (0)
7849
7850 static void dump_space_info(struct btrfs_fs_info *fs_info,
7851                             struct btrfs_space_info *info, u64 bytes,
7852                             int dump_block_groups)
7853 {
7854         struct btrfs_block_group_cache *cache;
7855         int index = 0;
7856
7857         spin_lock(&info->lock);
7858         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7859                    info->flags,
7860                    info->total_bytes - btrfs_space_info_used(info, true),
7861                    info->full ? "" : "not ");
7862         btrfs_info(fs_info,
7863                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7864                 info->total_bytes, info->bytes_used, info->bytes_pinned,
7865                 info->bytes_reserved, info->bytes_may_use,
7866                 info->bytes_readonly);
7867         spin_unlock(&info->lock);
7868
7869         DUMP_BLOCK_RSV(fs_info, global_block_rsv);
7870         DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
7871         DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
7872         DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
7873         DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
7874
7875         if (!dump_block_groups)
7876                 return;
7877
7878         down_read(&info->groups_sem);
7879 again:
7880         list_for_each_entry(cache, &info->block_groups[index], list) {
7881                 spin_lock(&cache->lock);
7882                 btrfs_info(fs_info,
7883                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7884                         cache->key.objectid, cache->key.offset,
7885                         btrfs_block_group_used(&cache->item), cache->pinned,
7886                         cache->reserved, cache->ro ? "[readonly]" : "");
7887                 btrfs_dump_free_space(cache, bytes);
7888                 spin_unlock(&cache->lock);
7889         }
7890         if (++index < BTRFS_NR_RAID_TYPES)
7891                 goto again;
7892         up_read(&info->groups_sem);
7893 }
7894
7895 /*
7896  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
7897  *                        hole that is at least as big as @num_bytes.
7898  *
7899  * @root           -    The root that will contain this extent
7900  *
7901  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
7902  *                      is used for accounting purposes. This value differs
7903  *                      from @num_bytes only in the case of compressed extents.
7904  *
7905  * @num_bytes      -    Number of bytes to allocate on-disk.
7906  *
7907  * @min_alloc_size -    Indicates the minimum amount of space that the
7908  *                      allocator should try to satisfy. In some cases
7909  *                      @num_bytes may be larger than what is required and if
7910  *                      the filesystem is fragmented then allocation fails.
7911  *                      However, the presence of @min_alloc_size gives a
7912  *                      chance to try and satisfy the smaller allocation.
7913  *
7914  * @empty_size     -    A hint that you plan on doing more COW. This is the
7915  *                      size in bytes the allocator should try to find free
7916  *                      next to the block it returns.  This is just a hint and
7917  *                      may be ignored by the allocator.
7918  *
7919  * @hint_byte      -    Hint to the allocator to start searching above the byte
7920  *                      address passed. It might be ignored.
7921  *
7922  * @ins            -    This key is modified to record the found hole. It will
7923  *                      have the following values:
7924  *                      ins->objectid == start position
7925  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
7926  *                      ins->offset == the size of the hole.
7927  *
7928  * @is_data        -    Boolean flag indicating whether an extent is
7929  *                      allocated for data (true) or metadata (false)
7930  *
7931  * @delalloc       -    Boolean flag indicating whether this allocation is for
7932  *                      delalloc or not. If 'true' data_rwsem of block groups
7933  *                      is going to be acquired.
7934  *
7935  *
7936  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
7937  * case -ENOSPC is returned then @ins->offset will contain the size of the
7938  * largest available hole the allocator managed to find.
7939  */
7940 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7941                          u64 num_bytes, u64 min_alloc_size,
7942                          u64 empty_size, u64 hint_byte,
7943                          struct btrfs_key *ins, int is_data, int delalloc)
7944 {
7945         struct btrfs_fs_info *fs_info = root->fs_info;
7946         bool final_tried = num_bytes == min_alloc_size;
7947         u64 flags;
7948         int ret;
7949
7950         flags = get_alloc_profile_by_root(root, is_data);
7951 again:
7952         WARN_ON(num_bytes < fs_info->sectorsize);
7953         ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7954                                hint_byte, ins, flags, delalloc);
7955         if (!ret && !is_data) {
7956                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7957         } else if (ret == -ENOSPC) {
7958                 if (!final_tried && ins->offset) {
7959                         num_bytes = min(num_bytes >> 1, ins->offset);
7960                         num_bytes = round_down(num_bytes,
7961                                                fs_info->sectorsize);
7962                         num_bytes = max(num_bytes, min_alloc_size);
7963                         ram_bytes = num_bytes;
7964                         if (num_bytes == min_alloc_size)
7965                                 final_tried = true;
7966                         goto again;
7967                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7968                         struct btrfs_space_info *sinfo;
7969
7970                         sinfo = btrfs_find_space_info(fs_info, flags);
7971                         btrfs_err(fs_info,
7972                                   "allocation failed flags %llu, wanted %llu",
7973                                   flags, num_bytes);
7974                         if (sinfo)
7975                                 dump_space_info(fs_info, sinfo, num_bytes, 1);
7976                 }
7977         }
7978
7979         return ret;
7980 }
7981
7982 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7983                                         u64 start, u64 len,
7984                                         int pin, int delalloc)
7985 {
7986         struct btrfs_block_group_cache *cache;
7987         int ret = 0;
7988
7989         cache = btrfs_lookup_block_group(fs_info, start);
7990         if (!cache) {
7991                 btrfs_err(fs_info, "Unable to find block group for %llu",
7992                           start);
7993                 return -ENOSPC;
7994         }
7995
7996         if (pin)
7997                 pin_down_extent(cache, start, len, 1);
7998         else {
7999                 if (btrfs_test_opt(fs_info, DISCARD))
8000                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
8001                 btrfs_add_free_space(cache, start, len);
8002                 btrfs_free_reserved_bytes(cache, len, delalloc);
8003                 trace_btrfs_reserved_extent_free(fs_info, start, len);
8004         }
8005
8006         btrfs_put_block_group(cache);
8007         return ret;
8008 }
8009
8010 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8011                                u64 start, u64 len, int delalloc)
8012 {
8013         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8014 }
8015
8016 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8017                                        u64 start, u64 len)
8018 {
8019         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8020 }
8021
8022 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8023                                       u64 parent, u64 root_objectid,
8024                                       u64 flags, u64 owner, u64 offset,
8025                                       struct btrfs_key *ins, int ref_mod)
8026 {
8027         struct btrfs_fs_info *fs_info = trans->fs_info;
8028         int ret;
8029         struct btrfs_extent_item *extent_item;
8030         struct btrfs_extent_inline_ref *iref;
8031         struct btrfs_path *path;
8032         struct extent_buffer *leaf;
8033         int type;
8034         u32 size;
8035
8036         if (parent > 0)
8037                 type = BTRFS_SHARED_DATA_REF_KEY;
8038         else
8039                 type = BTRFS_EXTENT_DATA_REF_KEY;
8040
8041         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8042
8043         path = btrfs_alloc_path();
8044         if (!path)
8045                 return -ENOMEM;
8046
8047         path->leave_spinning = 1;
8048         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8049                                       ins, size);
8050         if (ret) {
8051                 btrfs_free_path(path);
8052                 return ret;
8053         }
8054
8055         leaf = path->nodes[0];
8056         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8057                                      struct btrfs_extent_item);
8058         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8059         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8060         btrfs_set_extent_flags(leaf, extent_item,
8061                                flags | BTRFS_EXTENT_FLAG_DATA);
8062
8063         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8064         btrfs_set_extent_inline_ref_type(leaf, iref, type);
8065         if (parent > 0) {
8066                 struct btrfs_shared_data_ref *ref;
8067                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8068                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8069                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8070         } else {
8071                 struct btrfs_extent_data_ref *ref;
8072                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8073                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8074                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8075                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8076                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8077         }
8078
8079         btrfs_mark_buffer_dirty(path->nodes[0]);
8080         btrfs_free_path(path);
8081
8082         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8083         if (ret)
8084                 return ret;
8085
8086         ret = update_block_group(trans, ins->objectid, ins->offset, 1);
8087         if (ret) { /* -ENOENT, logic error */
8088                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8089                         ins->objectid, ins->offset);
8090                 BUG();
8091         }
8092         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8093         return ret;
8094 }
8095
8096 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8097                                      struct btrfs_delayed_ref_node *node,
8098                                      struct btrfs_delayed_extent_op *extent_op)
8099 {
8100         struct btrfs_fs_info *fs_info = trans->fs_info;
8101         int ret;
8102         struct btrfs_extent_item *extent_item;
8103         struct btrfs_key extent_key;
8104         struct btrfs_tree_block_info *block_info;
8105         struct btrfs_extent_inline_ref *iref;
8106         struct btrfs_path *path;
8107         struct extent_buffer *leaf;
8108         struct btrfs_delayed_tree_ref *ref;
8109         u32 size = sizeof(*extent_item) + sizeof(*iref);
8110         u64 num_bytes;
8111         u64 flags = extent_op->flags_to_set;
8112         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8113
8114         ref = btrfs_delayed_node_to_tree_ref(node);
8115
8116         extent_key.objectid = node->bytenr;
8117         if (skinny_metadata) {
8118                 extent_key.offset = ref->level;
8119                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
8120                 num_bytes = fs_info->nodesize;
8121         } else {
8122                 extent_key.offset = node->num_bytes;
8123                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8124                 size += sizeof(*block_info);
8125                 num_bytes = node->num_bytes;
8126         }
8127
8128         path = btrfs_alloc_path();
8129         if (!path)
8130                 return -ENOMEM;
8131
8132         path->leave_spinning = 1;
8133         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8134                                       &extent_key, size);
8135         if (ret) {
8136                 btrfs_free_path(path);
8137                 return ret;
8138         }
8139
8140         leaf = path->nodes[0];
8141         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8142                                      struct btrfs_extent_item);
8143         btrfs_set_extent_refs(leaf, extent_item, 1);
8144         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8145         btrfs_set_extent_flags(leaf, extent_item,
8146                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8147
8148         if (skinny_metadata) {
8149                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8150         } else {
8151                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8152                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8153                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
8154                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8155         }
8156
8157         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8158                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8159                 btrfs_set_extent_inline_ref_type(leaf, iref,
8160                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8161                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8162         } else {
8163                 btrfs_set_extent_inline_ref_type(leaf, iref,
8164                                                  BTRFS_TREE_BLOCK_REF_KEY);
8165                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8166         }
8167
8168         btrfs_mark_buffer_dirty(leaf);
8169         btrfs_free_path(path);
8170
8171         ret = remove_from_free_space_tree(trans, extent_key.objectid,
8172                                           num_bytes);
8173         if (ret)
8174                 return ret;
8175
8176         ret = update_block_group(trans, extent_key.objectid,
8177                                  fs_info->nodesize, 1);
8178         if (ret) { /* -ENOENT, logic error */
8179                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8180                         extent_key.objectid, extent_key.offset);
8181                 BUG();
8182         }
8183
8184         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8185                                           fs_info->nodesize);
8186         return ret;
8187 }
8188
8189 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8190                                      struct btrfs_root *root, u64 owner,
8191                                      u64 offset, u64 ram_bytes,
8192                                      struct btrfs_key *ins)
8193 {
8194         struct btrfs_ref generic_ref = { 0 };
8195         int ret;
8196
8197         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8198
8199         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8200                                ins->objectid, ins->offset, 0);
8201         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8202         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
8203         ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
8204                                          ram_bytes, NULL, NULL);
8205         return ret;
8206 }
8207
8208 /*
8209  * this is used by the tree logging recovery code.  It records that
8210  * an extent has been allocated and makes sure to clear the free
8211  * space cache bits as well
8212  */
8213 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8214                                    u64 root_objectid, u64 owner, u64 offset,
8215                                    struct btrfs_key *ins)
8216 {
8217         struct btrfs_fs_info *fs_info = trans->fs_info;
8218         int ret;
8219         struct btrfs_block_group_cache *block_group;
8220         struct btrfs_space_info *space_info;
8221
8222         /*
8223          * Mixed block groups will exclude before processing the log so we only
8224          * need to do the exclude dance if this fs isn't mixed.
8225          */
8226         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8227                 ret = __exclude_logged_extent(fs_info, ins->objectid,
8228                                               ins->offset);
8229                 if (ret)
8230                         return ret;
8231         }
8232
8233         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8234         if (!block_group)
8235                 return -EINVAL;
8236
8237         space_info = block_group->space_info;
8238         spin_lock(&space_info->lock);
8239         spin_lock(&block_group->lock);
8240         space_info->bytes_reserved += ins->offset;
8241         block_group->reserved += ins->offset;
8242         spin_unlock(&block_group->lock);
8243         spin_unlock(&space_info->lock);
8244
8245         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8246                                          offset, ins, 1);
8247         btrfs_put_block_group(block_group);
8248         return ret;
8249 }
8250
8251 static struct extent_buffer *
8252 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8253                       u64 bytenr, int level, u64 owner)
8254 {
8255         struct btrfs_fs_info *fs_info = root->fs_info;
8256         struct extent_buffer *buf;
8257
8258         buf = btrfs_find_create_tree_block(fs_info, bytenr);
8259         if (IS_ERR(buf))
8260                 return buf;
8261
8262         /*
8263          * Extra safety check in case the extent tree is corrupted and extent
8264          * allocator chooses to use a tree block which is already used and
8265          * locked.
8266          */
8267         if (buf->lock_owner == current->pid) {
8268                 btrfs_err_rl(fs_info,
8269 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8270                         buf->start, btrfs_header_owner(buf), current->pid);
8271                 free_extent_buffer(buf);
8272                 return ERR_PTR(-EUCLEAN);
8273         }
8274
8275         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8276         btrfs_tree_lock(buf);
8277         btrfs_clean_tree_block(buf);
8278         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8279
8280         btrfs_set_lock_blocking_write(buf);
8281         set_extent_buffer_uptodate(buf);
8282
8283         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8284         btrfs_set_header_level(buf, level);
8285         btrfs_set_header_bytenr(buf, buf->start);
8286         btrfs_set_header_generation(buf, trans->transid);
8287         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8288         btrfs_set_header_owner(buf, owner);
8289         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8290         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8291         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8292                 buf->log_index = root->log_transid % 2;
8293                 /*
8294                  * we allow two log transactions at a time, use different
8295                  * EXTENT bit to differentiate dirty pages.
8296                  */
8297                 if (buf->log_index == 0)
8298                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8299                                         buf->start + buf->len - 1, GFP_NOFS);
8300                 else
8301                         set_extent_new(&root->dirty_log_pages, buf->start,
8302                                         buf->start + buf->len - 1);
8303         } else {
8304                 buf->log_index = -1;
8305                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8306                          buf->start + buf->len - 1, GFP_NOFS);
8307         }
8308         trans->dirty = true;
8309         /* this returns a buffer locked for blocking */
8310         return buf;
8311 }
8312
8313 static struct btrfs_block_rsv *
8314 use_block_rsv(struct btrfs_trans_handle *trans,
8315               struct btrfs_root *root, u32 blocksize)
8316 {
8317         struct btrfs_fs_info *fs_info = root->fs_info;
8318         struct btrfs_block_rsv *block_rsv;
8319         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8320         int ret;
8321         bool global_updated = false;
8322
8323         block_rsv = get_block_rsv(trans, root);
8324
8325         if (unlikely(block_rsv->size == 0))
8326                 goto try_reserve;
8327 again:
8328         ret = block_rsv_use_bytes(block_rsv, blocksize);
8329         if (!ret)
8330                 return block_rsv;
8331
8332         if (block_rsv->failfast)
8333                 return ERR_PTR(ret);
8334
8335         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8336                 global_updated = true;
8337                 update_global_block_rsv(fs_info);
8338                 goto again;
8339         }
8340
8341         /*
8342          * The global reserve still exists to save us from ourselves, so don't
8343          * warn_on if we are short on our delayed refs reserve.
8344          */
8345         if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8346             btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8347                 static DEFINE_RATELIMIT_STATE(_rs,
8348                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8349                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8350                 if (__ratelimit(&_rs))
8351                         WARN(1, KERN_DEBUG
8352                                 "BTRFS: block rsv returned %d\n", ret);
8353         }
8354 try_reserve:
8355         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8356                                      BTRFS_RESERVE_NO_FLUSH);
8357         if (!ret)
8358                 return block_rsv;
8359         /*
8360          * If we couldn't reserve metadata bytes try and use some from
8361          * the global reserve if its space type is the same as the global
8362          * reservation.
8363          */
8364         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8365             block_rsv->space_info == global_rsv->space_info) {
8366                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8367                 if (!ret)
8368                         return global_rsv;
8369         }
8370         return ERR_PTR(ret);
8371 }
8372
8373 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8374                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8375 {
8376         block_rsv_add_bytes(block_rsv, blocksize, false);
8377         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8378 }
8379
8380 /*
8381  * finds a free extent and does all the dirty work required for allocation
8382  * returns the tree buffer or an ERR_PTR on error.
8383  */
8384 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8385                                              struct btrfs_root *root,
8386                                              u64 parent, u64 root_objectid,
8387                                              const struct btrfs_disk_key *key,
8388                                              int level, u64 hint,
8389                                              u64 empty_size)
8390 {
8391         struct btrfs_fs_info *fs_info = root->fs_info;
8392         struct btrfs_key ins;
8393         struct btrfs_block_rsv *block_rsv;
8394         struct extent_buffer *buf;
8395         struct btrfs_delayed_extent_op *extent_op;
8396         struct btrfs_ref generic_ref = { 0 };
8397         u64 flags = 0;
8398         int ret;
8399         u32 blocksize = fs_info->nodesize;
8400         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8401
8402 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8403         if (btrfs_is_testing(fs_info)) {
8404                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8405                                             level, root_objectid);
8406                 if (!IS_ERR(buf))
8407                         root->alloc_bytenr += blocksize;
8408                 return buf;
8409         }
8410 #endif
8411
8412         block_rsv = use_block_rsv(trans, root, blocksize);
8413         if (IS_ERR(block_rsv))
8414                 return ERR_CAST(block_rsv);
8415
8416         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8417                                    empty_size, hint, &ins, 0, 0);
8418         if (ret)
8419                 goto out_unuse;
8420
8421         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8422                                     root_objectid);
8423         if (IS_ERR(buf)) {
8424                 ret = PTR_ERR(buf);
8425                 goto out_free_reserved;
8426         }
8427
8428         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8429                 if (parent == 0)
8430                         parent = ins.objectid;
8431                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8432         } else
8433                 BUG_ON(parent > 0);
8434
8435         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8436                 extent_op = btrfs_alloc_delayed_extent_op();
8437                 if (!extent_op) {
8438                         ret = -ENOMEM;
8439                         goto out_free_buf;
8440                 }
8441                 if (key)
8442                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8443                 else
8444                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8445                 extent_op->flags_to_set = flags;
8446                 extent_op->update_key = skinny_metadata ? false : true;
8447                 extent_op->update_flags = true;
8448                 extent_op->is_data = false;
8449                 extent_op->level = level;
8450
8451                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
8452                                        ins.objectid, ins.offset, parent);
8453                 generic_ref.real_root = root->root_key.objectid;
8454                 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8455                 btrfs_ref_tree_mod(fs_info, &generic_ref);
8456                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
8457                                                  extent_op, NULL, NULL);
8458                 if (ret)
8459                         goto out_free_delayed;
8460         }
8461         return buf;
8462
8463 out_free_delayed:
8464         btrfs_free_delayed_extent_op(extent_op);
8465 out_free_buf:
8466         free_extent_buffer(buf);
8467 out_free_reserved:
8468         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8469 out_unuse:
8470         unuse_block_rsv(fs_info, block_rsv, blocksize);
8471         return ERR_PTR(ret);
8472 }
8473
8474 struct walk_control {
8475         u64 refs[BTRFS_MAX_LEVEL];
8476         u64 flags[BTRFS_MAX_LEVEL];
8477         struct btrfs_key update_progress;
8478         struct btrfs_key drop_progress;
8479         int drop_level;
8480         int stage;
8481         int level;
8482         int shared_level;
8483         int update_ref;
8484         int keep_locks;
8485         int reada_slot;
8486         int reada_count;
8487         int restarted;
8488 };
8489
8490 #define DROP_REFERENCE  1
8491 #define UPDATE_BACKREF  2
8492
8493 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8494                                      struct btrfs_root *root,
8495                                      struct walk_control *wc,
8496                                      struct btrfs_path *path)
8497 {
8498         struct btrfs_fs_info *fs_info = root->fs_info;
8499         u64 bytenr;
8500         u64 generation;
8501         u64 refs;
8502         u64 flags;
8503         u32 nritems;
8504         struct btrfs_key key;
8505         struct extent_buffer *eb;
8506         int ret;
8507         int slot;
8508         int nread = 0;
8509
8510         if (path->slots[wc->level] < wc->reada_slot) {
8511                 wc->reada_count = wc->reada_count * 2 / 3;
8512                 wc->reada_count = max(wc->reada_count, 2);
8513         } else {
8514                 wc->reada_count = wc->reada_count * 3 / 2;
8515                 wc->reada_count = min_t(int, wc->reada_count,
8516                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8517         }
8518
8519         eb = path->nodes[wc->level];
8520         nritems = btrfs_header_nritems(eb);
8521
8522         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8523                 if (nread >= wc->reada_count)
8524                         break;
8525
8526                 cond_resched();
8527                 bytenr = btrfs_node_blockptr(eb, slot);
8528                 generation = btrfs_node_ptr_generation(eb, slot);
8529
8530                 if (slot == path->slots[wc->level])
8531                         goto reada;
8532
8533                 if (wc->stage == UPDATE_BACKREF &&
8534                     generation <= root->root_key.offset)
8535                         continue;
8536
8537                 /* We don't lock the tree block, it's OK to be racy here */
8538                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8539                                                wc->level - 1, 1, &refs,
8540                                                &flags);
8541                 /* We don't care about errors in readahead. */
8542                 if (ret < 0)
8543                         continue;
8544                 BUG_ON(refs == 0);
8545
8546                 if (wc->stage == DROP_REFERENCE) {
8547                         if (refs == 1)
8548                                 goto reada;
8549
8550                         if (wc->level == 1 &&
8551                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8552                                 continue;
8553                         if (!wc->update_ref ||
8554                             generation <= root->root_key.offset)
8555                                 continue;
8556                         btrfs_node_key_to_cpu(eb, &key, slot);
8557                         ret = btrfs_comp_cpu_keys(&key,
8558                                                   &wc->update_progress);
8559                         if (ret < 0)
8560                                 continue;
8561                 } else {
8562                         if (wc->level == 1 &&
8563                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8564                                 continue;
8565                 }
8566 reada:
8567                 readahead_tree_block(fs_info, bytenr);
8568                 nread++;
8569         }
8570         wc->reada_slot = slot;
8571 }
8572
8573 /*
8574  * helper to process tree block while walking down the tree.
8575  *
8576  * when wc->stage == UPDATE_BACKREF, this function updates
8577  * back refs for pointers in the block.
8578  *
8579  * NOTE: return value 1 means we should stop walking down.
8580  */
8581 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8582                                    struct btrfs_root *root,
8583                                    struct btrfs_path *path,
8584                                    struct walk_control *wc, int lookup_info)
8585 {
8586         struct btrfs_fs_info *fs_info = root->fs_info;
8587         int level = wc->level;
8588         struct extent_buffer *eb = path->nodes[level];
8589         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8590         int ret;
8591
8592         if (wc->stage == UPDATE_BACKREF &&
8593             btrfs_header_owner(eb) != root->root_key.objectid)
8594                 return 1;
8595
8596         /*
8597          * when reference count of tree block is 1, it won't increase
8598          * again. once full backref flag is set, we never clear it.
8599          */
8600         if (lookup_info &&
8601             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8602              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8603                 BUG_ON(!path->locks[level]);
8604                 ret = btrfs_lookup_extent_info(trans, fs_info,
8605                                                eb->start, level, 1,
8606                                                &wc->refs[level],
8607                                                &wc->flags[level]);
8608                 BUG_ON(ret == -ENOMEM);
8609                 if (ret)
8610                         return ret;
8611                 BUG_ON(wc->refs[level] == 0);
8612         }
8613
8614         if (wc->stage == DROP_REFERENCE) {
8615                 if (wc->refs[level] > 1)
8616                         return 1;
8617
8618                 if (path->locks[level] && !wc->keep_locks) {
8619                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8620                         path->locks[level] = 0;
8621                 }
8622                 return 0;
8623         }
8624
8625         /* wc->stage == UPDATE_BACKREF */
8626         if (!(wc->flags[level] & flag)) {
8627                 BUG_ON(!path->locks[level]);
8628                 ret = btrfs_inc_ref(trans, root, eb, 1);
8629                 BUG_ON(ret); /* -ENOMEM */
8630                 ret = btrfs_dec_ref(trans, root, eb, 0);
8631                 BUG_ON(ret); /* -ENOMEM */
8632                 ret = btrfs_set_disk_extent_flags(trans, eb->start,
8633                                                   eb->len, flag,
8634                                                   btrfs_header_level(eb), 0);
8635                 BUG_ON(ret); /* -ENOMEM */
8636                 wc->flags[level] |= flag;
8637         }
8638
8639         /*
8640          * the block is shared by multiple trees, so it's not good to
8641          * keep the tree lock
8642          */
8643         if (path->locks[level] && level > 0) {
8644                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8645                 path->locks[level] = 0;
8646         }
8647         return 0;
8648 }
8649
8650 /*
8651  * This is used to verify a ref exists for this root to deal with a bug where we
8652  * would have a drop_progress key that hadn't been updated properly.
8653  */
8654 static int check_ref_exists(struct btrfs_trans_handle *trans,
8655                             struct btrfs_root *root, u64 bytenr, u64 parent,
8656                             int level)
8657 {
8658         struct btrfs_path *path;
8659         struct btrfs_extent_inline_ref *iref;
8660         int ret;
8661
8662         path = btrfs_alloc_path();
8663         if (!path)
8664                 return -ENOMEM;
8665
8666         ret = lookup_extent_backref(trans, path, &iref, bytenr,
8667                                     root->fs_info->nodesize, parent,
8668                                     root->root_key.objectid, level, 0);
8669         btrfs_free_path(path);
8670         if (ret == -ENOENT)
8671                 return 0;
8672         if (ret < 0)
8673                 return ret;
8674         return 1;
8675 }
8676
8677 /*
8678  * helper to process tree block pointer.
8679  *
8680  * when wc->stage == DROP_REFERENCE, this function checks
8681  * reference count of the block pointed to. if the block
8682  * is shared and we need update back refs for the subtree
8683  * rooted at the block, this function changes wc->stage to
8684  * UPDATE_BACKREF. if the block is shared and there is no
8685  * need to update back, this function drops the reference
8686  * to the block.
8687  *
8688  * NOTE: return value 1 means we should stop walking down.
8689  */
8690 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8691                                  struct btrfs_root *root,
8692                                  struct btrfs_path *path,
8693                                  struct walk_control *wc, int *lookup_info)
8694 {
8695         struct btrfs_fs_info *fs_info = root->fs_info;
8696         u64 bytenr;
8697         u64 generation;
8698         u64 parent;
8699         struct btrfs_key key;
8700         struct btrfs_key first_key;
8701         struct btrfs_ref ref = { 0 };
8702         struct extent_buffer *next;
8703         int level = wc->level;
8704         int reada = 0;
8705         int ret = 0;
8706         bool need_account = false;
8707
8708         generation = btrfs_node_ptr_generation(path->nodes[level],
8709                                                path->slots[level]);
8710         /*
8711          * if the lower level block was created before the snapshot
8712          * was created, we know there is no need to update back refs
8713          * for the subtree
8714          */
8715         if (wc->stage == UPDATE_BACKREF &&
8716             generation <= root->root_key.offset) {
8717                 *lookup_info = 1;
8718                 return 1;
8719         }
8720
8721         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8722         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8723                               path->slots[level]);
8724
8725         next = find_extent_buffer(fs_info, bytenr);
8726         if (!next) {
8727                 next = btrfs_find_create_tree_block(fs_info, bytenr);
8728                 if (IS_ERR(next))
8729                         return PTR_ERR(next);
8730
8731                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8732                                                level - 1);
8733                 reada = 1;
8734         }
8735         btrfs_tree_lock(next);
8736         btrfs_set_lock_blocking_write(next);
8737
8738         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8739                                        &wc->refs[level - 1],
8740                                        &wc->flags[level - 1]);
8741         if (ret < 0)
8742                 goto out_unlock;
8743
8744         if (unlikely(wc->refs[level - 1] == 0)) {
8745                 btrfs_err(fs_info, "Missing references.");
8746                 ret = -EIO;
8747                 goto out_unlock;
8748         }
8749         *lookup_info = 0;
8750
8751         if (wc->stage == DROP_REFERENCE) {
8752                 if (wc->refs[level - 1] > 1) {
8753                         need_account = true;
8754                         if (level == 1 &&
8755                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8756                                 goto skip;
8757
8758                         if (!wc->update_ref ||
8759                             generation <= root->root_key.offset)
8760                                 goto skip;
8761
8762                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8763                                               path->slots[level]);
8764                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8765                         if (ret < 0)
8766                                 goto skip;
8767
8768                         wc->stage = UPDATE_BACKREF;
8769                         wc->shared_level = level - 1;
8770                 }
8771         } else {
8772                 if (level == 1 &&
8773                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8774                         goto skip;
8775         }
8776
8777         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8778                 btrfs_tree_unlock(next);
8779                 free_extent_buffer(next);
8780                 next = NULL;
8781                 *lookup_info = 1;
8782         }
8783
8784         if (!next) {
8785                 if (reada && level == 1)
8786                         reada_walk_down(trans, root, wc, path);
8787                 next = read_tree_block(fs_info, bytenr, generation, level - 1,
8788                                        &first_key);
8789                 if (IS_ERR(next)) {
8790                         return PTR_ERR(next);
8791                 } else if (!extent_buffer_uptodate(next)) {
8792                         free_extent_buffer(next);
8793                         return -EIO;
8794                 }
8795                 btrfs_tree_lock(next);
8796                 btrfs_set_lock_blocking_write(next);
8797         }
8798
8799         level--;
8800         ASSERT(level == btrfs_header_level(next));
8801         if (level != btrfs_header_level(next)) {
8802                 btrfs_err(root->fs_info, "mismatched level");
8803                 ret = -EIO;
8804                 goto out_unlock;
8805         }
8806         path->nodes[level] = next;
8807         path->slots[level] = 0;
8808         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8809         wc->level = level;
8810         if (wc->level == 1)
8811                 wc->reada_slot = 0;
8812         return 0;
8813 skip:
8814         wc->refs[level - 1] = 0;
8815         wc->flags[level - 1] = 0;
8816         if (wc->stage == DROP_REFERENCE) {
8817                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8818                         parent = path->nodes[level]->start;
8819                 } else {
8820                         ASSERT(root->root_key.objectid ==
8821                                btrfs_header_owner(path->nodes[level]));
8822                         if (root->root_key.objectid !=
8823                             btrfs_header_owner(path->nodes[level])) {
8824                                 btrfs_err(root->fs_info,
8825                                                 "mismatched block owner");
8826                                 ret = -EIO;
8827                                 goto out_unlock;
8828                         }
8829                         parent = 0;
8830                 }
8831
8832                 /*
8833                  * If we had a drop_progress we need to verify the refs are set
8834                  * as expected.  If we find our ref then we know that from here
8835                  * on out everything should be correct, and we can clear the
8836                  * ->restarted flag.
8837                  */
8838                 if (wc->restarted) {
8839                         ret = check_ref_exists(trans, root, bytenr, parent,
8840                                                level - 1);
8841                         if (ret < 0)
8842                                 goto out_unlock;
8843                         if (ret == 0)
8844                                 goto no_delete;
8845                         ret = 0;
8846                         wc->restarted = 0;
8847                 }
8848
8849                 /*
8850                  * Reloc tree doesn't contribute to qgroup numbers, and we have
8851                  * already accounted them at merge time (replace_path),
8852                  * thus we could skip expensive subtree trace here.
8853                  */
8854                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
8855                     need_account) {
8856                         ret = btrfs_qgroup_trace_subtree(trans, next,
8857                                                          generation, level - 1);
8858                         if (ret) {
8859                                 btrfs_err_rl(fs_info,
8860                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8861                                              ret);
8862                         }
8863                 }
8864
8865                 /*
8866                  * We need to update the next key in our walk control so we can
8867                  * update the drop_progress key accordingly.  We don't care if
8868                  * find_next_key doesn't find a key because that means we're at
8869                  * the end and are going to clean up now.
8870                  */
8871                 wc->drop_level = level;
8872                 find_next_key(path, level, &wc->drop_progress);
8873
8874                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
8875                                        fs_info->nodesize, parent);
8876                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
8877                 ret = btrfs_free_extent(trans, &ref);
8878                 if (ret)
8879                         goto out_unlock;
8880         }
8881 no_delete:
8882         *lookup_info = 1;
8883         ret = 1;
8884
8885 out_unlock:
8886         btrfs_tree_unlock(next);
8887         free_extent_buffer(next);
8888
8889         return ret;
8890 }
8891
8892 /*
8893  * helper to process tree block while walking up the tree.
8894  *
8895  * when wc->stage == DROP_REFERENCE, this function drops
8896  * reference count on the block.
8897  *
8898  * when wc->stage == UPDATE_BACKREF, this function changes
8899  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8900  * to UPDATE_BACKREF previously while processing the block.
8901  *
8902  * NOTE: return value 1 means we should stop walking up.
8903  */
8904 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8905                                  struct btrfs_root *root,
8906                                  struct btrfs_path *path,
8907                                  struct walk_control *wc)
8908 {
8909         struct btrfs_fs_info *fs_info = root->fs_info;
8910         int ret;
8911         int level = wc->level;
8912         struct extent_buffer *eb = path->nodes[level];
8913         u64 parent = 0;
8914
8915         if (wc->stage == UPDATE_BACKREF) {
8916                 BUG_ON(wc->shared_level < level);
8917                 if (level < wc->shared_level)
8918                         goto out;
8919
8920                 ret = find_next_key(path, level + 1, &wc->update_progress);
8921                 if (ret > 0)
8922                         wc->update_ref = 0;
8923
8924                 wc->stage = DROP_REFERENCE;
8925                 wc->shared_level = -1;
8926                 path->slots[level] = 0;
8927
8928                 /*
8929                  * check reference count again if the block isn't locked.
8930                  * we should start walking down the tree again if reference
8931                  * count is one.
8932                  */
8933                 if (!path->locks[level]) {
8934                         BUG_ON(level == 0);
8935                         btrfs_tree_lock(eb);
8936                         btrfs_set_lock_blocking_write(eb);
8937                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8938
8939                         ret = btrfs_lookup_extent_info(trans, fs_info,
8940                                                        eb->start, level, 1,
8941                                                        &wc->refs[level],
8942                                                        &wc->flags[level]);
8943                         if (ret < 0) {
8944                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8945                                 path->locks[level] = 0;
8946                                 return ret;
8947                         }
8948                         BUG_ON(wc->refs[level] == 0);
8949                         if (wc->refs[level] == 1) {
8950                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8951                                 path->locks[level] = 0;
8952                                 return 1;
8953                         }
8954                 }
8955         }
8956
8957         /* wc->stage == DROP_REFERENCE */
8958         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8959
8960         if (wc->refs[level] == 1) {
8961                 if (level == 0) {
8962                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8963                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8964                         else
8965                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8966                         BUG_ON(ret); /* -ENOMEM */
8967                         if (is_fstree(root->root_key.objectid)) {
8968                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8969                                 if (ret) {
8970                                         btrfs_err_rl(fs_info,
8971         "error %d accounting leaf items, quota is out of sync, rescan required",
8972                                              ret);
8973                                 }
8974                         }
8975                 }
8976                 /* make block locked assertion in btrfs_clean_tree_block happy */
8977                 if (!path->locks[level] &&
8978                     btrfs_header_generation(eb) == trans->transid) {
8979                         btrfs_tree_lock(eb);
8980                         btrfs_set_lock_blocking_write(eb);
8981                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8982                 }
8983                 btrfs_clean_tree_block(eb);
8984         }
8985
8986         if (eb == root->node) {
8987                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8988                         parent = eb->start;
8989                 else if (root->root_key.objectid != btrfs_header_owner(eb))
8990                         goto owner_mismatch;
8991         } else {
8992                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8993                         parent = path->nodes[level + 1]->start;
8994                 else if (root->root_key.objectid !=
8995                          btrfs_header_owner(path->nodes[level + 1]))
8996                         goto owner_mismatch;
8997         }
8998
8999         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9000 out:
9001         wc->refs[level] = 0;
9002         wc->flags[level] = 0;
9003         return 0;
9004
9005 owner_mismatch:
9006         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
9007                      btrfs_header_owner(eb), root->root_key.objectid);
9008         return -EUCLEAN;
9009 }
9010
9011 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9012                                    struct btrfs_root *root,
9013                                    struct btrfs_path *path,
9014                                    struct walk_control *wc)
9015 {
9016         int level = wc->level;
9017         int lookup_info = 1;
9018         int ret;
9019
9020         while (level >= 0) {
9021                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9022                 if (ret > 0)
9023                         break;
9024
9025                 if (level == 0)
9026                         break;
9027
9028                 if (path->slots[level] >=
9029                     btrfs_header_nritems(path->nodes[level]))
9030                         break;
9031
9032                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9033                 if (ret > 0) {
9034                         path->slots[level]++;
9035                         continue;
9036                 } else if (ret < 0)
9037                         return ret;
9038                 level = wc->level;
9039         }
9040         return 0;
9041 }
9042
9043 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9044                                  struct btrfs_root *root,
9045                                  struct btrfs_path *path,
9046                                  struct walk_control *wc, int max_level)
9047 {
9048         int level = wc->level;
9049         int ret;
9050
9051         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9052         while (level < max_level && path->nodes[level]) {
9053                 wc->level = level;
9054                 if (path->slots[level] + 1 <
9055                     btrfs_header_nritems(path->nodes[level])) {
9056                         path->slots[level]++;
9057                         return 0;
9058                 } else {
9059                         ret = walk_up_proc(trans, root, path, wc);
9060                         if (ret > 0)
9061                                 return 0;
9062                         if (ret < 0)
9063                                 return ret;
9064
9065                         if (path->locks[level]) {
9066                                 btrfs_tree_unlock_rw(path->nodes[level],
9067                                                      path->locks[level]);
9068                                 path->locks[level] = 0;
9069                         }
9070                         free_extent_buffer(path->nodes[level]);
9071                         path->nodes[level] = NULL;
9072                         level++;
9073                 }
9074         }
9075         return 1;
9076 }
9077
9078 /*
9079  * drop a subvolume tree.
9080  *
9081  * this function traverses the tree freeing any blocks that only
9082  * referenced by the tree.
9083  *
9084  * when a shared tree block is found. this function decreases its
9085  * reference count by one. if update_ref is true, this function
9086  * also make sure backrefs for the shared block and all lower level
9087  * blocks are properly updated.
9088  *
9089  * If called with for_reloc == 0, may exit early with -EAGAIN
9090  */
9091 int btrfs_drop_snapshot(struct btrfs_root *root,
9092                          struct btrfs_block_rsv *block_rsv, int update_ref,
9093                          int for_reloc)
9094 {
9095         struct btrfs_fs_info *fs_info = root->fs_info;
9096         struct btrfs_path *path;
9097         struct btrfs_trans_handle *trans;
9098         struct btrfs_root *tree_root = fs_info->tree_root;
9099         struct btrfs_root_item *root_item = &root->root_item;
9100         struct walk_control *wc;
9101         struct btrfs_key key;
9102         int err = 0;
9103         int ret;
9104         int level;
9105         bool root_dropped = false;
9106
9107         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
9108
9109         path = btrfs_alloc_path();
9110         if (!path) {
9111                 err = -ENOMEM;
9112                 goto out;
9113         }
9114
9115         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9116         if (!wc) {
9117                 btrfs_free_path(path);
9118                 err = -ENOMEM;
9119                 goto out;
9120         }
9121
9122         trans = btrfs_start_transaction(tree_root, 0);
9123         if (IS_ERR(trans)) {
9124                 err = PTR_ERR(trans);
9125                 goto out_free;
9126         }
9127
9128         err = btrfs_run_delayed_items(trans);
9129         if (err)
9130                 goto out_end_trans;
9131
9132         if (block_rsv)
9133                 trans->block_rsv = block_rsv;
9134
9135         /*
9136          * This will help us catch people modifying the fs tree while we're
9137          * dropping it.  It is unsafe to mess with the fs tree while it's being
9138          * dropped as we unlock the root node and parent nodes as we walk down
9139          * the tree, assuming nothing will change.  If something does change
9140          * then we'll have stale information and drop references to blocks we've
9141          * already dropped.
9142          */
9143         set_bit(BTRFS_ROOT_DELETING, &root->state);
9144         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9145                 level = btrfs_header_level(root->node);
9146                 path->nodes[level] = btrfs_lock_root_node(root);
9147                 btrfs_set_lock_blocking_write(path->nodes[level]);
9148                 path->slots[level] = 0;
9149                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9150                 memset(&wc->update_progress, 0,
9151                        sizeof(wc->update_progress));
9152         } else {
9153                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9154                 memcpy(&wc->update_progress, &key,
9155                        sizeof(wc->update_progress));
9156
9157                 level = root_item->drop_level;
9158                 BUG_ON(level == 0);
9159                 path->lowest_level = level;
9160                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9161                 path->lowest_level = 0;
9162                 if (ret < 0) {
9163                         err = ret;
9164                         goto out_end_trans;
9165                 }
9166                 WARN_ON(ret > 0);
9167
9168                 /*
9169                  * unlock our path, this is safe because only this
9170                  * function is allowed to delete this snapshot
9171                  */
9172                 btrfs_unlock_up_safe(path, 0);
9173
9174                 level = btrfs_header_level(root->node);
9175                 while (1) {
9176                         btrfs_tree_lock(path->nodes[level]);
9177                         btrfs_set_lock_blocking_write(path->nodes[level]);
9178                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9179
9180                         ret = btrfs_lookup_extent_info(trans, fs_info,
9181                                                 path->nodes[level]->start,
9182                                                 level, 1, &wc->refs[level],
9183                                                 &wc->flags[level]);
9184                         if (ret < 0) {
9185                                 err = ret;
9186                                 goto out_end_trans;
9187                         }
9188                         BUG_ON(wc->refs[level] == 0);
9189
9190                         if (level == root_item->drop_level)
9191                                 break;
9192
9193                         btrfs_tree_unlock(path->nodes[level]);
9194                         path->locks[level] = 0;
9195                         WARN_ON(wc->refs[level] != 1);
9196                         level--;
9197                 }
9198         }
9199
9200         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
9201         wc->level = level;
9202         wc->shared_level = -1;
9203         wc->stage = DROP_REFERENCE;
9204         wc->update_ref = update_ref;
9205         wc->keep_locks = 0;
9206         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9207
9208         while (1) {
9209
9210                 ret = walk_down_tree(trans, root, path, wc);
9211                 if (ret < 0) {
9212                         err = ret;
9213                         break;
9214                 }
9215
9216                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9217                 if (ret < 0) {
9218                         err = ret;
9219                         break;
9220                 }
9221
9222                 if (ret > 0) {
9223                         BUG_ON(wc->stage != DROP_REFERENCE);
9224                         break;
9225                 }
9226
9227                 if (wc->stage == DROP_REFERENCE) {
9228                         wc->drop_level = wc->level;
9229                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
9230                                               &wc->drop_progress,
9231                                               path->slots[wc->drop_level]);
9232                 }
9233                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
9234                                       &wc->drop_progress);
9235                 root_item->drop_level = wc->drop_level;
9236
9237                 BUG_ON(wc->level == 0);
9238                 if (btrfs_should_end_transaction(trans) ||
9239                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9240                         ret = btrfs_update_root(trans, tree_root,
9241                                                 &root->root_key,
9242                                                 root_item);
9243                         if (ret) {
9244                                 btrfs_abort_transaction(trans, ret);
9245                                 err = ret;
9246                                 goto out_end_trans;
9247                         }
9248
9249                         btrfs_end_transaction_throttle(trans);
9250                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9251                                 btrfs_debug(fs_info,
9252                                             "drop snapshot early exit");
9253                                 err = -EAGAIN;
9254                                 goto out_free;
9255                         }
9256
9257                         trans = btrfs_start_transaction(tree_root, 0);
9258                         if (IS_ERR(trans)) {
9259                                 err = PTR_ERR(trans);
9260                                 goto out_free;
9261                         }
9262                         if (block_rsv)
9263                                 trans->block_rsv = block_rsv;
9264                 }
9265         }
9266         btrfs_release_path(path);
9267         if (err)
9268                 goto out_end_trans;
9269
9270         ret = btrfs_del_root(trans, &root->root_key);
9271         if (ret) {
9272                 btrfs_abort_transaction(trans, ret);
9273                 err = ret;
9274                 goto out_end_trans;
9275         }
9276
9277         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9278                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9279                                       NULL, NULL);
9280                 if (ret < 0) {
9281                         btrfs_abort_transaction(trans, ret);
9282                         err = ret;
9283                         goto out_end_trans;
9284                 } else if (ret > 0) {
9285                         /* if we fail to delete the orphan item this time
9286                          * around, it'll get picked up the next time.
9287                          *
9288                          * The most common failure here is just -ENOENT.
9289                          */
9290                         btrfs_del_orphan_item(trans, tree_root,
9291                                               root->root_key.objectid);
9292                 }
9293         }
9294
9295         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9296                 btrfs_add_dropped_root(trans, root);
9297         } else {
9298                 free_extent_buffer(root->node);
9299                 free_extent_buffer(root->commit_root);
9300                 btrfs_put_fs_root(root);
9301         }
9302         root_dropped = true;
9303 out_end_trans:
9304         btrfs_end_transaction_throttle(trans);
9305 out_free:
9306         kfree(wc);
9307         btrfs_free_path(path);
9308 out:
9309         /*
9310          * So if we need to stop dropping the snapshot for whatever reason we
9311          * need to make sure to add it back to the dead root list so that we
9312          * keep trying to do the work later.  This also cleans up roots if we
9313          * don't have it in the radix (like when we recover after a power fail
9314          * or unmount) so we don't leak memory.
9315          */
9316         if (!for_reloc && !root_dropped)
9317                 btrfs_add_dead_root(root);
9318         if (err && err != -EAGAIN)
9319                 btrfs_handle_fs_error(fs_info, err, NULL);
9320         return err;
9321 }
9322
9323 /*
9324  * drop subtree rooted at tree block 'node'.
9325  *
9326  * NOTE: this function will unlock and release tree block 'node'
9327  * only used by relocation code
9328  */
9329 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9330                         struct btrfs_root *root,
9331                         struct extent_buffer *node,
9332                         struct extent_buffer *parent)
9333 {
9334         struct btrfs_fs_info *fs_info = root->fs_info;
9335         struct btrfs_path *path;
9336         struct walk_control *wc;
9337         int level;
9338         int parent_level;
9339         int ret = 0;
9340         int wret;
9341
9342         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9343
9344         path = btrfs_alloc_path();
9345         if (!path)
9346                 return -ENOMEM;
9347
9348         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9349         if (!wc) {
9350                 btrfs_free_path(path);
9351                 return -ENOMEM;
9352         }
9353
9354         btrfs_assert_tree_locked(parent);
9355         parent_level = btrfs_header_level(parent);
9356         extent_buffer_get(parent);
9357         path->nodes[parent_level] = parent;
9358         path->slots[parent_level] = btrfs_header_nritems(parent);
9359
9360         btrfs_assert_tree_locked(node);
9361         level = btrfs_header_level(node);
9362         path->nodes[level] = node;
9363         path->slots[level] = 0;
9364         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9365
9366         wc->refs[parent_level] = 1;
9367         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9368         wc->level = level;
9369         wc->shared_level = -1;
9370         wc->stage = DROP_REFERENCE;
9371         wc->update_ref = 0;
9372         wc->keep_locks = 1;
9373         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9374
9375         while (1) {
9376                 wret = walk_down_tree(trans, root, path, wc);
9377                 if (wret < 0) {
9378                         ret = wret;
9379                         break;
9380                 }
9381
9382                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9383                 if (wret < 0)
9384                         ret = wret;
9385                 if (wret != 0)
9386                         break;
9387         }
9388
9389         kfree(wc);
9390         btrfs_free_path(path);
9391         return ret;
9392 }
9393
9394 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9395 {
9396         u64 num_devices;
9397         u64 stripped;
9398
9399         /*
9400          * if restripe for this chunk_type is on pick target profile and
9401          * return, otherwise do the usual balance
9402          */
9403         stripped = get_restripe_target(fs_info, flags);
9404         if (stripped)
9405                 return extended_to_chunk(stripped);
9406
9407         num_devices = fs_info->fs_devices->rw_devices;
9408
9409         stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
9410                 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
9411
9412         if (num_devices == 1) {
9413                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9414                 stripped = flags & ~stripped;
9415
9416                 /* turn raid0 into single device chunks */
9417                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9418                         return stripped;
9419
9420                 /* turn mirroring into duplication */
9421                 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
9422                              BTRFS_BLOCK_GROUP_RAID10))
9423                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9424         } else {
9425                 /* they already had raid on here, just return */
9426                 if (flags & stripped)
9427                         return flags;
9428
9429                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9430                 stripped = flags & ~stripped;
9431
9432                 /* switch duplicated blocks with raid1 */
9433                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9434                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9435
9436                 /* this is drive concat, leave it alone */
9437         }
9438
9439         return flags;
9440 }
9441
9442 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9443 {
9444         struct btrfs_space_info *sinfo = cache->space_info;
9445         u64 num_bytes;
9446         u64 sinfo_used;
9447         u64 min_allocable_bytes;
9448         int ret = -ENOSPC;
9449
9450         /*
9451          * We need some metadata space and system metadata space for
9452          * allocating chunks in some corner cases until we force to set
9453          * it to be readonly.
9454          */
9455         if ((sinfo->flags &
9456              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9457             !force)
9458                 min_allocable_bytes = SZ_1M;
9459         else
9460                 min_allocable_bytes = 0;
9461
9462         spin_lock(&sinfo->lock);
9463         spin_lock(&cache->lock);
9464
9465         if (cache->ro) {
9466                 cache->ro++;
9467                 ret = 0;
9468                 goto out;
9469         }
9470
9471         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9472                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9473         sinfo_used = btrfs_space_info_used(sinfo, true);
9474
9475         if (sinfo_used + num_bytes + min_allocable_bytes <=
9476             sinfo->total_bytes) {
9477                 sinfo->bytes_readonly += num_bytes;
9478                 cache->ro++;
9479                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9480                 ret = 0;
9481         }
9482 out:
9483         spin_unlock(&cache->lock);
9484         spin_unlock(&sinfo->lock);
9485         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9486                 btrfs_info(cache->fs_info,
9487                         "unable to make block group %llu ro",
9488                         cache->key.objectid);
9489                 btrfs_info(cache->fs_info,
9490                         "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9491                         sinfo_used, num_bytes, min_allocable_bytes);
9492                 dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9493         }
9494         return ret;
9495 }
9496
9497 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9498
9499 {
9500         struct btrfs_fs_info *fs_info = cache->fs_info;
9501         struct btrfs_trans_handle *trans;
9502         u64 alloc_flags;
9503         int ret;
9504
9505 again:
9506         trans = btrfs_join_transaction(fs_info->extent_root);
9507         if (IS_ERR(trans))
9508                 return PTR_ERR(trans);
9509
9510         /*
9511          * we're not allowed to set block groups readonly after the dirty
9512          * block groups cache has started writing.  If it already started,
9513          * back off and let this transaction commit
9514          */
9515         mutex_lock(&fs_info->ro_block_group_mutex);
9516         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9517                 u64 transid = trans->transid;
9518
9519                 mutex_unlock(&fs_info->ro_block_group_mutex);
9520                 btrfs_end_transaction(trans);
9521
9522                 ret = btrfs_wait_for_commit(fs_info, transid);
9523                 if (ret)
9524                         return ret;
9525                 goto again;
9526         }
9527
9528         /*
9529          * if we are changing raid levels, try to allocate a corresponding
9530          * block group with the new raid level.
9531          */
9532         alloc_flags = update_block_group_flags(fs_info, cache->flags);
9533         if (alloc_flags != cache->flags) {
9534                 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9535                 /*
9536                  * ENOSPC is allowed here, we may have enough space
9537                  * already allocated at the new raid level to
9538                  * carry on
9539                  */
9540                 if (ret == -ENOSPC)
9541                         ret = 0;
9542                 if (ret < 0)
9543                         goto out;
9544         }
9545
9546         ret = inc_block_group_ro(cache, 0);
9547         if (!ret)
9548                 goto out;
9549         alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9550         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9551         if (ret < 0)
9552                 goto out;
9553         ret = inc_block_group_ro(cache, 0);
9554 out:
9555         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9556                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9557                 mutex_lock(&fs_info->chunk_mutex);
9558                 check_system_chunk(trans, alloc_flags);
9559                 mutex_unlock(&fs_info->chunk_mutex);
9560         }
9561         mutex_unlock(&fs_info->ro_block_group_mutex);
9562
9563         btrfs_end_transaction(trans);
9564         return ret;
9565 }
9566
9567 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9568 {
9569         u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9570
9571         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9572 }
9573
9574 /*
9575  * helper to account the unused space of all the readonly block group in the
9576  * space_info. takes mirrors into account.
9577  */
9578 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9579 {
9580         struct btrfs_block_group_cache *block_group;
9581         u64 free_bytes = 0;
9582         int factor;
9583
9584         /* It's df, we don't care if it's racy */
9585         if (list_empty(&sinfo->ro_bgs))
9586                 return 0;
9587
9588         spin_lock(&sinfo->lock);
9589         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9590                 spin_lock(&block_group->lock);
9591
9592                 if (!block_group->ro) {
9593                         spin_unlock(&block_group->lock);
9594                         continue;
9595                 }
9596
9597                 factor = btrfs_bg_type_to_factor(block_group->flags);
9598                 free_bytes += (block_group->key.offset -
9599                                btrfs_block_group_used(&block_group->item)) *
9600                                factor;
9601
9602                 spin_unlock(&block_group->lock);
9603         }
9604         spin_unlock(&sinfo->lock);
9605
9606         return free_bytes;
9607 }
9608
9609 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9610 {
9611         struct btrfs_space_info *sinfo = cache->space_info;
9612         u64 num_bytes;
9613
9614         BUG_ON(!cache->ro);
9615
9616         spin_lock(&sinfo->lock);
9617         spin_lock(&cache->lock);
9618         if (!--cache->ro) {
9619                 num_bytes = cache->key.offset - cache->reserved -
9620                             cache->pinned - cache->bytes_super -
9621                             btrfs_block_group_used(&cache->item);
9622                 sinfo->bytes_readonly -= num_bytes;
9623                 list_del_init(&cache->ro_list);
9624         }
9625         spin_unlock(&cache->lock);
9626         spin_unlock(&sinfo->lock);
9627 }
9628
9629 /*
9630  * Checks to see if it's even possible to relocate this block group.
9631  *
9632  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9633  * ok to go ahead and try.
9634  */
9635 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9636 {
9637         struct btrfs_block_group_cache *block_group;
9638         struct btrfs_space_info *space_info;
9639         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9640         struct btrfs_device *device;
9641         u64 min_free;
9642         u64 dev_min = 1;
9643         u64 dev_nr = 0;
9644         u64 target;
9645         int debug;
9646         int index;
9647         int full = 0;
9648         int ret = 0;
9649
9650         debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9651
9652         block_group = btrfs_lookup_block_group(fs_info, bytenr);
9653
9654         /* odd, couldn't find the block group, leave it alone */
9655         if (!block_group) {
9656                 if (debug)
9657                         btrfs_warn(fs_info,
9658                                    "can't find block group for bytenr %llu",
9659                                    bytenr);
9660                 return -1;
9661         }
9662
9663         min_free = btrfs_block_group_used(&block_group->item);
9664
9665         /* no bytes used, we're good */
9666         if (!min_free)
9667                 goto out;
9668
9669         space_info = block_group->space_info;
9670         spin_lock(&space_info->lock);
9671
9672         full = space_info->full;
9673
9674         /*
9675          * if this is the last block group we have in this space, we can't
9676          * relocate it unless we're able to allocate a new chunk below.
9677          *
9678          * Otherwise, we need to make sure we have room in the space to handle
9679          * all of the extents from this block group.  If we can, we're good
9680          */
9681         if ((space_info->total_bytes != block_group->key.offset) &&
9682             (btrfs_space_info_used(space_info, false) + min_free <
9683              space_info->total_bytes)) {
9684                 spin_unlock(&space_info->lock);
9685                 goto out;
9686         }
9687         spin_unlock(&space_info->lock);
9688
9689         /*
9690          * ok we don't have enough space, but maybe we have free space on our
9691          * devices to allocate new chunks for relocation, so loop through our
9692          * alloc devices and guess if we have enough space.  if this block
9693          * group is going to be restriped, run checks against the target
9694          * profile instead of the current one.
9695          */
9696         ret = -1;
9697
9698         /*
9699          * index:
9700          *      0: raid10
9701          *      1: raid1
9702          *      2: dup
9703          *      3: raid0
9704          *      4: single
9705          */
9706         target = get_restripe_target(fs_info, block_group->flags);
9707         if (target) {
9708                 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9709         } else {
9710                 /*
9711                  * this is just a balance, so if we were marked as full
9712                  * we know there is no space for a new chunk
9713                  */
9714                 if (full) {
9715                         if (debug)
9716                                 btrfs_warn(fs_info,
9717                                            "no space to alloc new chunk for block group %llu",
9718                                            block_group->key.objectid);
9719                         goto out;
9720                 }
9721
9722                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
9723         }
9724
9725         if (index == BTRFS_RAID_RAID10) {
9726                 dev_min = 4;
9727                 /* Divide by 2 */
9728                 min_free >>= 1;
9729         } else if (index == BTRFS_RAID_RAID1) {
9730                 dev_min = 2;
9731         } else if (index == BTRFS_RAID_DUP) {
9732                 /* Multiply by 2 */
9733                 min_free <<= 1;
9734         } else if (index == BTRFS_RAID_RAID0) {
9735                 dev_min = fs_devices->rw_devices;
9736                 min_free = div64_u64(min_free, dev_min);
9737         }
9738
9739         mutex_lock(&fs_info->chunk_mutex);
9740         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9741                 u64 dev_offset;
9742
9743                 /*
9744                  * check to make sure we can actually find a chunk with enough
9745                  * space to fit our block group in.
9746                  */
9747                 if (device->total_bytes > device->bytes_used + min_free &&
9748                     !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9749                         ret = find_free_dev_extent(device, min_free,
9750                                                    &dev_offset, NULL);
9751                         if (!ret)
9752                                 dev_nr++;
9753
9754                         if (dev_nr >= dev_min)
9755                                 break;
9756
9757                         ret = -1;
9758                 }
9759         }
9760         if (debug && ret == -1)
9761                 btrfs_warn(fs_info,
9762                            "no space to allocate a new chunk for block group %llu",
9763                            block_group->key.objectid);
9764         mutex_unlock(&fs_info->chunk_mutex);
9765 out:
9766         btrfs_put_block_group(block_group);
9767         return ret;
9768 }
9769
9770 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9771                                   struct btrfs_path *path,
9772                                   struct btrfs_key *key)
9773 {
9774         struct btrfs_root *root = fs_info->extent_root;
9775         int ret = 0;
9776         struct btrfs_key found_key;
9777         struct extent_buffer *leaf;
9778         struct btrfs_block_group_item bg;
9779         u64 flags;
9780         int slot;
9781
9782         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9783         if (ret < 0)
9784                 goto out;
9785
9786         while (1) {
9787                 slot = path->slots[0];
9788                 leaf = path->nodes[0];
9789                 if (slot >= btrfs_header_nritems(leaf)) {
9790                         ret = btrfs_next_leaf(root, path);
9791                         if (ret == 0)
9792                                 continue;
9793                         if (ret < 0)
9794                                 goto out;
9795                         break;
9796                 }
9797                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9798
9799                 if (found_key.objectid >= key->objectid &&
9800                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9801                         struct extent_map_tree *em_tree;
9802                         struct extent_map *em;
9803
9804                         em_tree = &root->fs_info->mapping_tree;
9805                         read_lock(&em_tree->lock);
9806                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9807                                                    found_key.offset);
9808                         read_unlock(&em_tree->lock);
9809                         if (!em) {
9810                                 btrfs_err(fs_info,
9811                         "logical %llu len %llu found bg but no related chunk",
9812                                           found_key.objectid, found_key.offset);
9813                                 ret = -ENOENT;
9814                         } else if (em->start != found_key.objectid ||
9815                                    em->len != found_key.offset) {
9816                                 btrfs_err(fs_info,
9817                 "block group %llu len %llu mismatch with chunk %llu len %llu",
9818                                           found_key.objectid, found_key.offset,
9819                                           em->start, em->len);
9820                                 ret = -EUCLEAN;
9821                         } else {
9822                                 read_extent_buffer(leaf, &bg,
9823                                         btrfs_item_ptr_offset(leaf, slot),
9824                                         sizeof(bg));
9825                                 flags = btrfs_block_group_flags(&bg) &
9826                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
9827
9828                                 if (flags != (em->map_lookup->type &
9829                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9830                                         btrfs_err(fs_info,
9831 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9832                                                 found_key.objectid,
9833                                                 found_key.offset, flags,
9834                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
9835                                                  em->map_lookup->type));
9836                                         ret = -EUCLEAN;
9837                                 } else {
9838                                         ret = 0;
9839                                 }
9840                         }
9841                         free_extent_map(em);
9842                         goto out;
9843                 }
9844                 path->slots[0]++;
9845         }
9846 out:
9847         return ret;
9848 }
9849
9850 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9851 {
9852         struct btrfs_block_group_cache *block_group;
9853         u64 last = 0;
9854
9855         while (1) {
9856                 struct inode *inode;
9857
9858                 block_group = btrfs_lookup_first_block_group(info, last);
9859                 while (block_group) {
9860                         wait_block_group_cache_done(block_group);
9861                         spin_lock(&block_group->lock);
9862                         if (block_group->iref)
9863                                 break;
9864                         spin_unlock(&block_group->lock);
9865                         block_group = next_block_group(block_group);
9866                 }
9867                 if (!block_group) {
9868                         if (last == 0)
9869                                 break;
9870                         last = 0;
9871                         continue;
9872                 }
9873
9874                 inode = block_group->inode;
9875                 block_group->iref = 0;
9876                 block_group->inode = NULL;
9877                 spin_unlock(&block_group->lock);
9878                 ASSERT(block_group->io_ctl.inode == NULL);
9879                 iput(inode);
9880                 last = block_group->key.objectid + block_group->key.offset;
9881                 btrfs_put_block_group(block_group);
9882         }
9883 }
9884
9885 /*
9886  * Must be called only after stopping all workers, since we could have block
9887  * group caching kthreads running, and therefore they could race with us if we
9888  * freed the block groups before stopping them.
9889  */
9890 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9891 {
9892         struct btrfs_block_group_cache *block_group;
9893         struct btrfs_space_info *space_info;
9894         struct btrfs_caching_control *caching_ctl;
9895         struct rb_node *n;
9896
9897         down_write(&info->commit_root_sem);
9898         while (!list_empty(&info->caching_block_groups)) {
9899                 caching_ctl = list_entry(info->caching_block_groups.next,
9900                                          struct btrfs_caching_control, list);
9901                 list_del(&caching_ctl->list);
9902                 put_caching_control(caching_ctl);
9903         }
9904         up_write(&info->commit_root_sem);
9905
9906         spin_lock(&info->unused_bgs_lock);
9907         while (!list_empty(&info->unused_bgs)) {
9908                 block_group = list_first_entry(&info->unused_bgs,
9909                                                struct btrfs_block_group_cache,
9910                                                bg_list);
9911                 list_del_init(&block_group->bg_list);
9912                 btrfs_put_block_group(block_group);
9913         }
9914         spin_unlock(&info->unused_bgs_lock);
9915
9916         spin_lock(&info->block_group_cache_lock);
9917         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9918                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9919                                        cache_node);
9920                 rb_erase(&block_group->cache_node,
9921                          &info->block_group_cache_tree);
9922                 RB_CLEAR_NODE(&block_group->cache_node);
9923                 spin_unlock(&info->block_group_cache_lock);
9924
9925                 down_write(&block_group->space_info->groups_sem);
9926                 list_del(&block_group->list);
9927                 up_write(&block_group->space_info->groups_sem);
9928
9929                 /*
9930                  * We haven't cached this block group, which means we could
9931                  * possibly have excluded extents on this block group.
9932                  */
9933                 if (block_group->cached == BTRFS_CACHE_NO ||
9934                     block_group->cached == BTRFS_CACHE_ERROR)
9935                         free_excluded_extents(block_group);
9936
9937                 btrfs_remove_free_space_cache(block_group);
9938                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9939                 ASSERT(list_empty(&block_group->dirty_list));
9940                 ASSERT(list_empty(&block_group->io_list));
9941                 ASSERT(list_empty(&block_group->bg_list));
9942                 ASSERT(atomic_read(&block_group->count) == 1);
9943                 btrfs_put_block_group(block_group);
9944
9945                 spin_lock(&info->block_group_cache_lock);
9946         }
9947         spin_unlock(&info->block_group_cache_lock);
9948
9949         /* now that all the block groups are freed, go through and
9950          * free all the space_info structs.  This is only called during
9951          * the final stages of unmount, and so we know nobody is
9952          * using them.  We call synchronize_rcu() once before we start,
9953          * just to be on the safe side.
9954          */
9955         synchronize_rcu();
9956
9957         release_global_block_rsv(info);
9958
9959         while (!list_empty(&info->space_info)) {
9960                 int i;
9961
9962                 space_info = list_entry(info->space_info.next,
9963                                         struct btrfs_space_info,
9964                                         list);
9965
9966                 /*
9967                  * Do not hide this behind enospc_debug, this is actually
9968                  * important and indicates a real bug if this happens.
9969                  */
9970                 if (WARN_ON(space_info->bytes_pinned > 0 ||
9971                             space_info->bytes_reserved > 0 ||
9972                             space_info->bytes_may_use > 0))
9973                         dump_space_info(info, space_info, 0, 0);
9974                 list_del(&space_info->list);
9975                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9976                         struct kobject *kobj;
9977                         kobj = space_info->block_group_kobjs[i];
9978                         space_info->block_group_kobjs[i] = NULL;
9979                         if (kobj) {
9980                                 kobject_del(kobj);
9981                                 kobject_put(kobj);
9982                         }
9983                 }
9984                 kobject_del(&space_info->kobj);
9985                 kobject_put(&space_info->kobj);
9986         }
9987         return 0;
9988 }
9989
9990 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9991 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9992 {
9993         struct btrfs_space_info *space_info;
9994         struct raid_kobject *rkobj;
9995         LIST_HEAD(list);
9996         int ret = 0;
9997
9998         spin_lock(&fs_info->pending_raid_kobjs_lock);
9999         list_splice_init(&fs_info->pending_raid_kobjs, &list);
10000         spin_unlock(&fs_info->pending_raid_kobjs_lock);
10001
10002         list_for_each_entry(rkobj, &list, list) {
10003                 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
10004
10005                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10006                                 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
10007                 if (ret) {
10008                         kobject_put(&rkobj->kobj);
10009                         break;
10010                 }
10011         }
10012         if (ret)
10013                 btrfs_warn(fs_info,
10014                            "failed to add kobject for block cache, ignoring");
10015 }
10016
10017 static void link_block_group(struct btrfs_block_group_cache *cache)
10018 {
10019         struct btrfs_space_info *space_info = cache->space_info;
10020         struct btrfs_fs_info *fs_info = cache->fs_info;
10021         int index = btrfs_bg_flags_to_raid_index(cache->flags);
10022         bool first = false;
10023
10024         down_write(&space_info->groups_sem);
10025         if (list_empty(&space_info->block_groups[index]))
10026                 first = true;
10027         list_add_tail(&cache->list, &space_info->block_groups[index]);
10028         up_write(&space_info->groups_sem);
10029
10030         if (first) {
10031                 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10032                 if (!rkobj) {
10033                         btrfs_warn(cache->fs_info,
10034                                 "couldn't alloc memory for raid level kobject");
10035                         return;
10036                 }
10037                 rkobj->flags = cache->flags;
10038                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10039
10040                 spin_lock(&fs_info->pending_raid_kobjs_lock);
10041                 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10042                 spin_unlock(&fs_info->pending_raid_kobjs_lock);
10043                 space_info->block_group_kobjs[index] = &rkobj->kobj;
10044         }
10045 }
10046
10047 static struct btrfs_block_group_cache *
10048 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10049                                u64 start, u64 size)
10050 {
10051         struct btrfs_block_group_cache *cache;
10052
10053         cache = kzalloc(sizeof(*cache), GFP_NOFS);
10054         if (!cache)
10055                 return NULL;
10056
10057         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10058                                         GFP_NOFS);
10059         if (!cache->free_space_ctl) {
10060                 kfree(cache);
10061                 return NULL;
10062         }
10063
10064         cache->key.objectid = start;
10065         cache->key.offset = size;
10066         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10067
10068         cache->fs_info = fs_info;
10069         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10070         set_free_space_tree_thresholds(cache);
10071
10072         atomic_set(&cache->count, 1);
10073         spin_lock_init(&cache->lock);
10074         init_rwsem(&cache->data_rwsem);
10075         INIT_LIST_HEAD(&cache->list);
10076         INIT_LIST_HEAD(&cache->cluster_list);
10077         INIT_LIST_HEAD(&cache->bg_list);
10078         INIT_LIST_HEAD(&cache->ro_list);
10079         INIT_LIST_HEAD(&cache->dirty_list);
10080         INIT_LIST_HEAD(&cache->io_list);
10081         btrfs_init_free_space_ctl(cache);
10082         atomic_set(&cache->trimming, 0);
10083         mutex_init(&cache->free_space_lock);
10084         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10085
10086         return cache;
10087 }
10088
10089
10090 /*
10091  * Iterate all chunks and verify that each of them has the corresponding block
10092  * group
10093  */
10094 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10095 {
10096         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
10097         struct extent_map *em;
10098         struct btrfs_block_group_cache *bg;
10099         u64 start = 0;
10100         int ret = 0;
10101
10102         while (1) {
10103                 read_lock(&map_tree->lock);
10104                 /*
10105                  * lookup_extent_mapping will return the first extent map
10106                  * intersecting the range, so setting @len to 1 is enough to
10107                  * get the first chunk.
10108                  */
10109                 em = lookup_extent_mapping(map_tree, start, 1);
10110                 read_unlock(&map_tree->lock);
10111                 if (!em)
10112                         break;
10113
10114                 bg = btrfs_lookup_block_group(fs_info, em->start);
10115                 if (!bg) {
10116                         btrfs_err(fs_info,
10117         "chunk start=%llu len=%llu doesn't have corresponding block group",
10118                                      em->start, em->len);
10119                         ret = -EUCLEAN;
10120                         free_extent_map(em);
10121                         break;
10122                 }
10123                 if (bg->key.objectid != em->start ||
10124                     bg->key.offset != em->len ||
10125                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10126                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10127                         btrfs_err(fs_info,
10128 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10129                                 em->start, em->len,
10130                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10131                                 bg->key.objectid, bg->key.offset,
10132                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10133                         ret = -EUCLEAN;
10134                         free_extent_map(em);
10135                         btrfs_put_block_group(bg);
10136                         break;
10137                 }
10138                 start = em->start + em->len;
10139                 free_extent_map(em);
10140                 btrfs_put_block_group(bg);
10141         }
10142         return ret;
10143 }
10144
10145 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10146 {
10147         struct btrfs_path *path;
10148         int ret;
10149         struct btrfs_block_group_cache *cache;
10150         struct btrfs_space_info *space_info;
10151         struct btrfs_key key;
10152         struct btrfs_key found_key;
10153         struct extent_buffer *leaf;
10154         int need_clear = 0;
10155         u64 cache_gen;
10156         u64 feature;
10157         int mixed;
10158
10159         feature = btrfs_super_incompat_flags(info->super_copy);
10160         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10161
10162         key.objectid = 0;
10163         key.offset = 0;
10164         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10165         path = btrfs_alloc_path();
10166         if (!path)
10167                 return -ENOMEM;
10168         path->reada = READA_FORWARD;
10169
10170         cache_gen = btrfs_super_cache_generation(info->super_copy);
10171         if (btrfs_test_opt(info, SPACE_CACHE) &&
10172             btrfs_super_generation(info->super_copy) != cache_gen)
10173                 need_clear = 1;
10174         if (btrfs_test_opt(info, CLEAR_CACHE))
10175                 need_clear = 1;
10176
10177         while (1) {
10178                 ret = find_first_block_group(info, path, &key);
10179                 if (ret > 0)
10180                         break;
10181                 if (ret != 0)
10182                         goto error;
10183
10184                 leaf = path->nodes[0];
10185                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10186
10187                 cache = btrfs_create_block_group_cache(info, found_key.objectid,
10188                                                        found_key.offset);
10189                 if (!cache) {
10190                         ret = -ENOMEM;
10191                         goto error;
10192                 }
10193
10194                 if (need_clear) {
10195                         /*
10196                          * When we mount with old space cache, we need to
10197                          * set BTRFS_DC_CLEAR and set dirty flag.
10198                          *
10199                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10200                          *    truncate the old free space cache inode and
10201                          *    setup a new one.
10202                          * b) Setting 'dirty flag' makes sure that we flush
10203                          *    the new space cache info onto disk.
10204                          */
10205                         if (btrfs_test_opt(info, SPACE_CACHE))
10206                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10207                 }
10208
10209                 read_extent_buffer(leaf, &cache->item,
10210                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10211                                    sizeof(cache->item));
10212                 cache->flags = btrfs_block_group_flags(&cache->item);
10213                 if (!mixed &&
10214                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10215                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10216                         btrfs_err(info,
10217 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10218                                   cache->key.objectid);
10219                         ret = -EINVAL;
10220                         goto error;
10221                 }
10222
10223                 key.objectid = found_key.objectid + found_key.offset;
10224                 btrfs_release_path(path);
10225
10226                 /*
10227                  * We need to exclude the super stripes now so that the space
10228                  * info has super bytes accounted for, otherwise we'll think
10229                  * we have more space than we actually do.
10230                  */
10231                 ret = exclude_super_stripes(cache);
10232                 if (ret) {
10233                         /*
10234                          * We may have excluded something, so call this just in
10235                          * case.
10236                          */
10237                         free_excluded_extents(cache);
10238                         btrfs_put_block_group(cache);
10239                         goto error;
10240                 }
10241
10242                 /*
10243                  * check for two cases, either we are full, and therefore
10244                  * don't need to bother with the caching work since we won't
10245                  * find any space, or we are empty, and we can just add all
10246                  * the space in and be done with it.  This saves us _a_lot_ of
10247                  * time, particularly in the full case.
10248                  */
10249                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10250                         cache->last_byte_to_unpin = (u64)-1;
10251                         cache->cached = BTRFS_CACHE_FINISHED;
10252                         free_excluded_extents(cache);
10253                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10254                         cache->last_byte_to_unpin = (u64)-1;
10255                         cache->cached = BTRFS_CACHE_FINISHED;
10256                         add_new_free_space(cache, found_key.objectid,
10257                                            found_key.objectid +
10258                                            found_key.offset);
10259                         free_excluded_extents(cache);
10260                 }
10261
10262                 ret = btrfs_add_block_group_cache(info, cache);
10263                 if (ret) {
10264                         btrfs_remove_free_space_cache(cache);
10265                         btrfs_put_block_group(cache);
10266                         goto error;
10267                 }
10268
10269                 trace_btrfs_add_block_group(info, cache, 0);
10270                 btrfs_update_space_info(info, cache->flags, found_key.offset,
10271                                         btrfs_block_group_used(&cache->item),
10272                                         cache->bytes_super, &space_info);
10273
10274                 cache->space_info = space_info;
10275
10276                 link_block_group(cache);
10277
10278                 set_avail_alloc_bits(info, cache->flags);
10279                 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10280                         inc_block_group_ro(cache, 1);
10281                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10282                         ASSERT(list_empty(&cache->bg_list));
10283                         btrfs_mark_bg_unused(cache);
10284                 }
10285         }
10286
10287         list_for_each_entry_rcu(space_info, &info->space_info, list) {
10288                 if (!(get_alloc_profile(info, space_info->flags) &
10289                       (BTRFS_BLOCK_GROUP_RAID10 |
10290                        BTRFS_BLOCK_GROUP_RAID1_MASK |
10291                        BTRFS_BLOCK_GROUP_RAID56_MASK |
10292                        BTRFS_BLOCK_GROUP_DUP)))
10293                         continue;
10294                 /*
10295                  * avoid allocating from un-mirrored block group if there are
10296                  * mirrored block groups.
10297                  */
10298                 list_for_each_entry(cache,
10299                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10300                                 list)
10301                         inc_block_group_ro(cache, 1);
10302                 list_for_each_entry(cache,
10303                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10304                                 list)
10305                         inc_block_group_ro(cache, 1);
10306         }
10307
10308         btrfs_add_raid_kobjects(info);
10309         init_global_block_rsv(info);
10310         ret = check_chunk_block_group_mappings(info);
10311 error:
10312         btrfs_free_path(path);
10313         return ret;
10314 }
10315
10316 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10317 {
10318         struct btrfs_fs_info *fs_info = trans->fs_info;
10319         struct btrfs_block_group_cache *block_group;
10320         struct btrfs_root *extent_root = fs_info->extent_root;
10321         struct btrfs_block_group_item item;
10322         struct btrfs_key key;
10323         int ret = 0;
10324
10325         if (!trans->can_flush_pending_bgs)
10326                 return;
10327
10328         while (!list_empty(&trans->new_bgs)) {
10329                 block_group = list_first_entry(&trans->new_bgs,
10330                                                struct btrfs_block_group_cache,
10331                                                bg_list);
10332                 if (ret)
10333                         goto next;
10334
10335                 spin_lock(&block_group->lock);
10336                 memcpy(&item, &block_group->item, sizeof(item));
10337                 memcpy(&key, &block_group->key, sizeof(key));
10338                 spin_unlock(&block_group->lock);
10339
10340                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10341                                         sizeof(item));
10342                 if (ret)
10343                         btrfs_abort_transaction(trans, ret);
10344                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10345                 if (ret)
10346                         btrfs_abort_transaction(trans, ret);
10347                 add_block_group_free_space(trans, block_group);
10348                 /* already aborted the transaction if it failed. */
10349 next:
10350                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10351                 list_del_init(&block_group->bg_list);
10352         }
10353         btrfs_trans_release_chunk_metadata(trans);
10354 }
10355
10356 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10357                            u64 type, u64 chunk_offset, u64 size)
10358 {
10359         struct btrfs_fs_info *fs_info = trans->fs_info;
10360         struct btrfs_block_group_cache *cache;
10361         int ret;
10362
10363         btrfs_set_log_full_commit(trans);
10364
10365         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10366         if (!cache)
10367                 return -ENOMEM;
10368
10369         btrfs_set_block_group_used(&cache->item, bytes_used);
10370         btrfs_set_block_group_chunk_objectid(&cache->item,
10371                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10372         btrfs_set_block_group_flags(&cache->item, type);
10373
10374         cache->flags = type;
10375         cache->last_byte_to_unpin = (u64)-1;
10376         cache->cached = BTRFS_CACHE_FINISHED;
10377         cache->needs_free_space = 1;
10378         ret = exclude_super_stripes(cache);
10379         if (ret) {
10380                 /*
10381                  * We may have excluded something, so call this just in
10382                  * case.
10383                  */
10384                 free_excluded_extents(cache);
10385                 btrfs_put_block_group(cache);
10386                 return ret;
10387         }
10388
10389         add_new_free_space(cache, chunk_offset, chunk_offset + size);
10390
10391         free_excluded_extents(cache);
10392
10393 #ifdef CONFIG_BTRFS_DEBUG
10394         if (btrfs_should_fragment_free_space(cache)) {
10395                 u64 new_bytes_used = size - bytes_used;
10396
10397                 bytes_used += new_bytes_used >> 1;
10398                 fragment_free_space(cache);
10399         }
10400 #endif
10401         /*
10402          * Ensure the corresponding space_info object is created and
10403          * assigned to our block group. We want our bg to be added to the rbtree
10404          * with its ->space_info set.
10405          */
10406         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
10407         ASSERT(cache->space_info);
10408
10409         ret = btrfs_add_block_group_cache(fs_info, cache);
10410         if (ret) {
10411                 btrfs_remove_free_space_cache(cache);
10412                 btrfs_put_block_group(cache);
10413                 return ret;
10414         }
10415
10416         /*
10417          * Now that our block group has its ->space_info set and is inserted in
10418          * the rbtree, update the space info's counters.
10419          */
10420         trace_btrfs_add_block_group(fs_info, cache, 1);
10421         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
10422                                 cache->bytes_super, &cache->space_info);
10423         update_global_block_rsv(fs_info);
10424
10425         link_block_group(cache);
10426
10427         list_add_tail(&cache->bg_list, &trans->new_bgs);
10428         trans->delayed_ref_updates++;
10429         btrfs_update_delayed_refs_rsv(trans);
10430
10431         set_avail_alloc_bits(fs_info, type);
10432         return 0;
10433 }
10434
10435 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10436 {
10437         u64 extra_flags = chunk_to_extended(flags) &
10438                                 BTRFS_EXTENDED_PROFILE_MASK;
10439
10440         write_seqlock(&fs_info->profiles_lock);
10441         if (flags & BTRFS_BLOCK_GROUP_DATA)
10442                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10443         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10444                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10445         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10446                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10447         write_sequnlock(&fs_info->profiles_lock);
10448 }
10449
10450 /*
10451  * Clear incompat bits for the following feature(s):
10452  *
10453  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
10454  *            in the whole filesystem
10455  */
10456 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
10457 {
10458         if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
10459                 struct list_head *head = &fs_info->space_info;
10460                 struct btrfs_space_info *sinfo;
10461
10462                 list_for_each_entry_rcu(sinfo, head, list) {
10463                         bool found = false;
10464
10465                         down_read(&sinfo->groups_sem);
10466                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
10467                                 found = true;
10468                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
10469                                 found = true;
10470                         up_read(&sinfo->groups_sem);
10471
10472                         if (found)
10473                                 return;
10474                 }
10475                 btrfs_clear_fs_incompat(fs_info, RAID56);
10476         }
10477 }
10478
10479 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10480                              u64 group_start, struct extent_map *em)
10481 {
10482         struct btrfs_fs_info *fs_info = trans->fs_info;
10483         struct btrfs_root *root = fs_info->extent_root;
10484         struct btrfs_path *path;
10485         struct btrfs_block_group_cache *block_group;
10486         struct btrfs_free_cluster *cluster;
10487         struct btrfs_root *tree_root = fs_info->tree_root;
10488         struct btrfs_key key;
10489         struct inode *inode;
10490         struct kobject *kobj = NULL;
10491         int ret;
10492         int index;
10493         int factor;
10494         struct btrfs_caching_control *caching_ctl = NULL;
10495         bool remove_em;
10496         bool remove_rsv = false;
10497
10498         block_group = btrfs_lookup_block_group(fs_info, group_start);
10499         BUG_ON(!block_group);
10500         BUG_ON(!block_group->ro);
10501
10502         trace_btrfs_remove_block_group(block_group);
10503         /*
10504          * Free the reserved super bytes from this block group before
10505          * remove it.
10506          */
10507         free_excluded_extents(block_group);
10508         btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10509                                   block_group->key.offset);
10510
10511         memcpy(&key, &block_group->key, sizeof(key));
10512         index = btrfs_bg_flags_to_raid_index(block_group->flags);
10513         factor = btrfs_bg_type_to_factor(block_group->flags);
10514
10515         /* make sure this block group isn't part of an allocation cluster */
10516         cluster = &fs_info->data_alloc_cluster;
10517         spin_lock(&cluster->refill_lock);
10518         btrfs_return_cluster_to_free_space(block_group, cluster);
10519         spin_unlock(&cluster->refill_lock);
10520
10521         /*
10522          * make sure this block group isn't part of a metadata
10523          * allocation cluster
10524          */
10525         cluster = &fs_info->meta_alloc_cluster;
10526         spin_lock(&cluster->refill_lock);
10527         btrfs_return_cluster_to_free_space(block_group, cluster);
10528         spin_unlock(&cluster->refill_lock);
10529
10530         path = btrfs_alloc_path();
10531         if (!path) {
10532                 ret = -ENOMEM;
10533                 goto out;
10534         }
10535
10536         /*
10537          * get the inode first so any iput calls done for the io_list
10538          * aren't the final iput (no unlinks allowed now)
10539          */
10540         inode = lookup_free_space_inode(block_group, path);
10541
10542         mutex_lock(&trans->transaction->cache_write_mutex);
10543         /*
10544          * Make sure our free space cache IO is done before removing the
10545          * free space inode
10546          */
10547         spin_lock(&trans->transaction->dirty_bgs_lock);
10548         if (!list_empty(&block_group->io_list)) {
10549                 list_del_init(&block_group->io_list);
10550
10551                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10552
10553                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10554                 btrfs_wait_cache_io(trans, block_group, path);
10555                 btrfs_put_block_group(block_group);
10556                 spin_lock(&trans->transaction->dirty_bgs_lock);
10557         }
10558
10559         if (!list_empty(&block_group->dirty_list)) {
10560                 list_del_init(&block_group->dirty_list);
10561                 remove_rsv = true;
10562                 btrfs_put_block_group(block_group);
10563         }
10564         spin_unlock(&trans->transaction->dirty_bgs_lock);
10565         mutex_unlock(&trans->transaction->cache_write_mutex);
10566
10567         if (!IS_ERR(inode)) {
10568                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10569                 if (ret) {
10570                         btrfs_add_delayed_iput(inode);
10571                         goto out;
10572                 }
10573                 clear_nlink(inode);
10574                 /* One for the block groups ref */
10575                 spin_lock(&block_group->lock);
10576                 if (block_group->iref) {
10577                         block_group->iref = 0;
10578                         block_group->inode = NULL;
10579                         spin_unlock(&block_group->lock);
10580                         iput(inode);
10581                 } else {
10582                         spin_unlock(&block_group->lock);
10583                 }
10584                 /* One for our lookup ref */
10585                 btrfs_add_delayed_iput(inode);
10586         }
10587
10588         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10589         key.offset = block_group->key.objectid;
10590         key.type = 0;
10591
10592         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10593         if (ret < 0)
10594                 goto out;
10595         if (ret > 0)
10596                 btrfs_release_path(path);
10597         if (ret == 0) {
10598                 ret = btrfs_del_item(trans, tree_root, path);
10599                 if (ret)
10600                         goto out;
10601                 btrfs_release_path(path);
10602         }
10603
10604         spin_lock(&fs_info->block_group_cache_lock);
10605         rb_erase(&block_group->cache_node,
10606                  &fs_info->block_group_cache_tree);
10607         RB_CLEAR_NODE(&block_group->cache_node);
10608
10609         if (fs_info->first_logical_byte == block_group->key.objectid)
10610                 fs_info->first_logical_byte = (u64)-1;
10611         spin_unlock(&fs_info->block_group_cache_lock);
10612
10613         down_write(&block_group->space_info->groups_sem);
10614         /*
10615          * we must use list_del_init so people can check to see if they
10616          * are still on the list after taking the semaphore
10617          */
10618         list_del_init(&block_group->list);
10619         if (list_empty(&block_group->space_info->block_groups[index])) {
10620                 kobj = block_group->space_info->block_group_kobjs[index];
10621                 block_group->space_info->block_group_kobjs[index] = NULL;
10622                 clear_avail_alloc_bits(fs_info, block_group->flags);
10623         }
10624         up_write(&block_group->space_info->groups_sem);
10625         clear_incompat_bg_bits(fs_info, block_group->flags);
10626         if (kobj) {
10627                 kobject_del(kobj);
10628                 kobject_put(kobj);
10629         }
10630
10631         if (block_group->has_caching_ctl)
10632                 caching_ctl = get_caching_control(block_group);
10633         if (block_group->cached == BTRFS_CACHE_STARTED)
10634                 wait_block_group_cache_done(block_group);
10635         if (block_group->has_caching_ctl) {
10636                 down_write(&fs_info->commit_root_sem);
10637                 if (!caching_ctl) {
10638                         struct btrfs_caching_control *ctl;
10639
10640                         list_for_each_entry(ctl,
10641                                     &fs_info->caching_block_groups, list)
10642                                 if (ctl->block_group == block_group) {
10643                                         caching_ctl = ctl;
10644                                         refcount_inc(&caching_ctl->count);
10645                                         break;
10646                                 }
10647                 }
10648                 if (caching_ctl)
10649                         list_del_init(&caching_ctl->list);
10650                 up_write(&fs_info->commit_root_sem);
10651                 if (caching_ctl) {
10652                         /* Once for the caching bgs list and once for us. */
10653                         put_caching_control(caching_ctl);
10654                         put_caching_control(caching_ctl);
10655                 }
10656         }
10657
10658         spin_lock(&trans->transaction->dirty_bgs_lock);
10659         WARN_ON(!list_empty(&block_group->dirty_list));
10660         WARN_ON(!list_empty(&block_group->io_list));
10661         spin_unlock(&trans->transaction->dirty_bgs_lock);
10662
10663         btrfs_remove_free_space_cache(block_group);
10664
10665         spin_lock(&block_group->space_info->lock);
10666         list_del_init(&block_group->ro_list);
10667
10668         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10669                 WARN_ON(block_group->space_info->total_bytes
10670                         < block_group->key.offset);
10671                 WARN_ON(block_group->space_info->bytes_readonly
10672                         < block_group->key.offset);
10673                 WARN_ON(block_group->space_info->disk_total
10674                         < block_group->key.offset * factor);
10675         }
10676         block_group->space_info->total_bytes -= block_group->key.offset;
10677         block_group->space_info->bytes_readonly -= block_group->key.offset;
10678         block_group->space_info->disk_total -= block_group->key.offset * factor;
10679
10680         spin_unlock(&block_group->space_info->lock);
10681
10682         memcpy(&key, &block_group->key, sizeof(key));
10683
10684         mutex_lock(&fs_info->chunk_mutex);
10685         spin_lock(&block_group->lock);
10686         block_group->removed = 1;
10687         /*
10688          * At this point trimming can't start on this block group, because we
10689          * removed the block group from the tree fs_info->block_group_cache_tree
10690          * so no one can't find it anymore and even if someone already got this
10691          * block group before we removed it from the rbtree, they have already
10692          * incremented block_group->trimming - if they didn't, they won't find
10693          * any free space entries because we already removed them all when we
10694          * called btrfs_remove_free_space_cache().
10695          *
10696          * And we must not remove the extent map from the fs_info->mapping_tree
10697          * to prevent the same logical address range and physical device space
10698          * ranges from being reused for a new block group. This is because our
10699          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10700          * completely transactionless, so while it is trimming a range the
10701          * currently running transaction might finish and a new one start,
10702          * allowing for new block groups to be created that can reuse the same
10703          * physical device locations unless we take this special care.
10704          *
10705          * There may also be an implicit trim operation if the file system
10706          * is mounted with -odiscard. The same protections must remain
10707          * in place until the extents have been discarded completely when
10708          * the transaction commit has completed.
10709          */
10710         remove_em = (atomic_read(&block_group->trimming) == 0);
10711         spin_unlock(&block_group->lock);
10712
10713         mutex_unlock(&fs_info->chunk_mutex);
10714
10715         ret = remove_block_group_free_space(trans, block_group);
10716         if (ret)
10717                 goto out;
10718
10719         btrfs_put_block_group(block_group);
10720         btrfs_put_block_group(block_group);
10721
10722         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10723         if (ret > 0)
10724                 ret = -EIO;
10725         if (ret < 0)
10726                 goto out;
10727
10728         ret = btrfs_del_item(trans, root, path);
10729         if (ret)
10730                 goto out;
10731
10732         if (remove_em) {
10733                 struct extent_map_tree *em_tree;
10734
10735                 em_tree = &fs_info->mapping_tree;
10736                 write_lock(&em_tree->lock);
10737                 remove_extent_mapping(em_tree, em);
10738                 write_unlock(&em_tree->lock);
10739                 /* once for the tree */
10740                 free_extent_map(em);
10741         }
10742 out:
10743         if (remove_rsv)
10744                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10745         btrfs_free_path(path);
10746         return ret;
10747 }
10748
10749 struct btrfs_trans_handle *
10750 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10751                                      const u64 chunk_offset)
10752 {
10753         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
10754         struct extent_map *em;
10755         struct map_lookup *map;
10756         unsigned int num_items;
10757
10758         read_lock(&em_tree->lock);
10759         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10760         read_unlock(&em_tree->lock);
10761         ASSERT(em && em->start == chunk_offset);
10762
10763         /*
10764          * We need to reserve 3 + N units from the metadata space info in order
10765          * to remove a block group (done at btrfs_remove_chunk() and at
10766          * btrfs_remove_block_group()), which are used for:
10767          *
10768          * 1 unit for adding the free space inode's orphan (located in the tree
10769          * of tree roots).
10770          * 1 unit for deleting the block group item (located in the extent
10771          * tree).
10772          * 1 unit for deleting the free space item (located in tree of tree
10773          * roots).
10774          * N units for deleting N device extent items corresponding to each
10775          * stripe (located in the device tree).
10776          *
10777          * In order to remove a block group we also need to reserve units in the
10778          * system space info in order to update the chunk tree (update one or
10779          * more device items and remove one chunk item), but this is done at
10780          * btrfs_remove_chunk() through a call to check_system_chunk().
10781          */
10782         map = em->map_lookup;
10783         num_items = 3 + map->num_stripes;
10784         free_extent_map(em);
10785
10786         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10787                                                            num_items, 1);
10788 }
10789
10790 /*
10791  * Process the unused_bgs list and remove any that don't have any allocated
10792  * space inside of them.
10793  */
10794 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10795 {
10796         struct btrfs_block_group_cache *block_group;
10797         struct btrfs_space_info *space_info;
10798         struct btrfs_trans_handle *trans;
10799         int ret = 0;
10800
10801         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10802                 return;
10803
10804         spin_lock(&fs_info->unused_bgs_lock);
10805         while (!list_empty(&fs_info->unused_bgs)) {
10806                 u64 start, end;
10807                 int trimming;
10808
10809                 block_group = list_first_entry(&fs_info->unused_bgs,
10810                                                struct btrfs_block_group_cache,
10811                                                bg_list);
10812                 list_del_init(&block_group->bg_list);
10813
10814                 space_info = block_group->space_info;
10815
10816                 if (ret || btrfs_mixed_space_info(space_info)) {
10817                         btrfs_put_block_group(block_group);
10818                         continue;
10819                 }
10820                 spin_unlock(&fs_info->unused_bgs_lock);
10821
10822                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10823
10824                 /* Don't want to race with allocators so take the groups_sem */
10825                 down_write(&space_info->groups_sem);
10826                 spin_lock(&block_group->lock);
10827                 if (block_group->reserved || block_group->pinned ||
10828                     btrfs_block_group_used(&block_group->item) ||
10829                     block_group->ro ||
10830                     list_is_singular(&block_group->list)) {
10831                         /*
10832                          * We want to bail if we made new allocations or have
10833                          * outstanding allocations in this block group.  We do
10834                          * the ro check in case balance is currently acting on
10835                          * this block group.
10836                          */
10837                         trace_btrfs_skip_unused_block_group(block_group);
10838                         spin_unlock(&block_group->lock);
10839                         up_write(&space_info->groups_sem);
10840                         goto next;
10841                 }
10842                 spin_unlock(&block_group->lock);
10843
10844                 /* We don't want to force the issue, only flip if it's ok. */
10845                 ret = inc_block_group_ro(block_group, 0);
10846                 up_write(&space_info->groups_sem);
10847                 if (ret < 0) {
10848                         ret = 0;
10849                         goto next;
10850                 }
10851
10852                 /*
10853                  * Want to do this before we do anything else so we can recover
10854                  * properly if we fail to join the transaction.
10855                  */
10856                 trans = btrfs_start_trans_remove_block_group(fs_info,
10857                                                      block_group->key.objectid);
10858                 if (IS_ERR(trans)) {
10859                         btrfs_dec_block_group_ro(block_group);
10860                         ret = PTR_ERR(trans);
10861                         goto next;
10862                 }
10863
10864                 /*
10865                  * We could have pending pinned extents for this block group,
10866                  * just delete them, we don't care about them anymore.
10867                  */
10868                 start = block_group->key.objectid;
10869                 end = start + block_group->key.offset - 1;
10870                 /*
10871                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10872                  * btrfs_finish_extent_commit(). If we are at transaction N,
10873                  * another task might be running finish_extent_commit() for the
10874                  * previous transaction N - 1, and have seen a range belonging
10875                  * to the block group in freed_extents[] before we were able to
10876                  * clear the whole block group range from freed_extents[]. This
10877                  * means that task can lookup for the block group after we
10878                  * unpinned it from freed_extents[] and removed it, leading to
10879                  * a BUG_ON() at btrfs_unpin_extent_range().
10880                  */
10881                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10882                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10883                                   EXTENT_DIRTY);
10884                 if (ret) {
10885                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10886                         btrfs_dec_block_group_ro(block_group);
10887                         goto end_trans;
10888                 }
10889                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10890                                   EXTENT_DIRTY);
10891                 if (ret) {
10892                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10893                         btrfs_dec_block_group_ro(block_group);
10894                         goto end_trans;
10895                 }
10896                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10897
10898                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10899                 spin_lock(&space_info->lock);
10900                 spin_lock(&block_group->lock);
10901
10902                 update_bytes_pinned(fs_info, space_info, -block_group->pinned);
10903                 space_info->bytes_readonly += block_group->pinned;
10904                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
10905                                    -block_group->pinned,
10906                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
10907                 block_group->pinned = 0;
10908
10909                 spin_unlock(&block_group->lock);
10910                 spin_unlock(&space_info->lock);
10911
10912                 /* DISCARD can flip during remount */
10913                 trimming = btrfs_test_opt(fs_info, DISCARD);
10914
10915                 /* Implicit trim during transaction commit. */
10916                 if (trimming)
10917                         btrfs_get_block_group_trimming(block_group);
10918
10919                 /*
10920                  * Btrfs_remove_chunk will abort the transaction if things go
10921                  * horribly wrong.
10922                  */
10923                 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10924
10925                 if (ret) {
10926                         if (trimming)
10927                                 btrfs_put_block_group_trimming(block_group);
10928                         goto end_trans;
10929                 }
10930
10931                 /*
10932                  * If we're not mounted with -odiscard, we can just forget
10933                  * about this block group. Otherwise we'll need to wait
10934                  * until transaction commit to do the actual discard.
10935                  */
10936                 if (trimming) {
10937                         spin_lock(&fs_info->unused_bgs_lock);
10938                         /*
10939                          * A concurrent scrub might have added us to the list
10940                          * fs_info->unused_bgs, so use a list_move operation
10941                          * to add the block group to the deleted_bgs list.
10942                          */
10943                         list_move(&block_group->bg_list,
10944                                   &trans->transaction->deleted_bgs);
10945                         spin_unlock(&fs_info->unused_bgs_lock);
10946                         btrfs_get_block_group(block_group);
10947                 }
10948 end_trans:
10949                 btrfs_end_transaction(trans);
10950 next:
10951                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10952                 btrfs_put_block_group(block_group);
10953                 spin_lock(&fs_info->unused_bgs_lock);
10954         }
10955         spin_unlock(&fs_info->unused_bgs_lock);
10956 }
10957
10958 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10959                                    u64 start, u64 end)
10960 {
10961         return unpin_extent_range(fs_info, start, end, false);
10962 }
10963
10964 /*
10965  * It used to be that old block groups would be left around forever.
10966  * Iterating over them would be enough to trim unused space.  Since we
10967  * now automatically remove them, we also need to iterate over unallocated
10968  * space.
10969  *
10970  * We don't want a transaction for this since the discard may take a
10971  * substantial amount of time.  We don't require that a transaction be
10972  * running, but we do need to take a running transaction into account
10973  * to ensure that we're not discarding chunks that were released or
10974  * allocated in the current transaction.
10975  *
10976  * Holding the chunks lock will prevent other threads from allocating
10977  * or releasing chunks, but it won't prevent a running transaction
10978  * from committing and releasing the memory that the pending chunks
10979  * list head uses.  For that, we need to take a reference to the
10980  * transaction and hold the commit root sem.  We only need to hold
10981  * it while performing the free space search since we have already
10982  * held back allocations.
10983  */
10984 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
10985 {
10986         u64 start = SZ_1M, len = 0, end = 0;
10987         int ret;
10988
10989         *trimmed = 0;
10990
10991         /* Discard not supported = nothing to do. */
10992         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10993                 return 0;
10994
10995         /* Not writable = nothing to do. */
10996         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10997                 return 0;
10998
10999         /* No free space = nothing to do. */
11000         if (device->total_bytes <= device->bytes_used)
11001                 return 0;
11002
11003         ret = 0;
11004
11005         while (1) {
11006                 struct btrfs_fs_info *fs_info = device->fs_info;
11007                 u64 bytes;
11008
11009                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11010                 if (ret)
11011                         break;
11012
11013                 find_first_clear_extent_bit(&device->alloc_state, start,
11014                                             &start, &end,
11015                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
11016
11017                 /* Ensure we skip the reserved area in the first 1M */
11018                 start = max_t(u64, start, SZ_1M);
11019
11020                 /*
11021                  * If find_first_clear_extent_bit find a range that spans the
11022                  * end of the device it will set end to -1, in this case it's up
11023                  * to the caller to trim the value to the size of the device.
11024                  */
11025                 end = min(end, device->total_bytes - 1);
11026
11027                 len = end - start + 1;
11028
11029                 /* We didn't find any extents */
11030                 if (!len) {
11031                         mutex_unlock(&fs_info->chunk_mutex);
11032                         ret = 0;
11033                         break;
11034                 }
11035
11036                 ret = btrfs_issue_discard(device->bdev, start, len,
11037                                           &bytes);
11038                 if (!ret)
11039                         set_extent_bits(&device->alloc_state, start,
11040                                         start + bytes - 1,
11041                                         CHUNK_TRIMMED);
11042                 mutex_unlock(&fs_info->chunk_mutex);
11043
11044                 if (ret)
11045                         break;
11046
11047                 start += len;
11048                 *trimmed += bytes;
11049
11050                 if (fatal_signal_pending(current)) {
11051                         ret = -ERESTARTSYS;
11052                         break;
11053                 }
11054
11055                 cond_resched();
11056         }
11057
11058         return ret;
11059 }
11060
11061 /*
11062  * Trim the whole filesystem by:
11063  * 1) trimming the free space in each block group
11064  * 2) trimming the unallocated space on each device
11065  *
11066  * This will also continue trimming even if a block group or device encounters
11067  * an error.  The return value will be the last error, or 0 if nothing bad
11068  * happens.
11069  */
11070 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11071 {
11072         struct btrfs_block_group_cache *cache = NULL;
11073         struct btrfs_device *device;
11074         struct list_head *devices;
11075         u64 group_trimmed;
11076         u64 start;
11077         u64 end;
11078         u64 trimmed = 0;
11079         u64 bg_failed = 0;
11080         u64 dev_failed = 0;
11081         int bg_ret = 0;
11082         int dev_ret = 0;
11083         int ret = 0;
11084
11085         cache = btrfs_lookup_first_block_group(fs_info, range->start);
11086         for (; cache; cache = next_block_group(cache)) {
11087                 if (cache->key.objectid >= (range->start + range->len)) {
11088                         btrfs_put_block_group(cache);
11089                         break;
11090                 }
11091
11092                 start = max(range->start, cache->key.objectid);
11093                 end = min(range->start + range->len,
11094                                 cache->key.objectid + cache->key.offset);
11095
11096                 if (end - start >= range->minlen) {
11097                         if (!block_group_cache_done(cache)) {
11098                                 ret = cache_block_group(cache, 0);
11099                                 if (ret) {
11100                                         bg_failed++;
11101                                         bg_ret = ret;
11102                                         continue;
11103                                 }
11104                                 ret = wait_block_group_cache_done(cache);
11105                                 if (ret) {
11106                                         bg_failed++;
11107                                         bg_ret = ret;
11108                                         continue;
11109                                 }
11110                         }
11111                         ret = btrfs_trim_block_group(cache,
11112                                                      &group_trimmed,
11113                                                      start,
11114                                                      end,
11115                                                      range->minlen);
11116
11117                         trimmed += group_trimmed;
11118                         if (ret) {
11119                                 bg_failed++;
11120                                 bg_ret = ret;
11121                                 continue;
11122                         }
11123                 }
11124         }
11125
11126         if (bg_failed)
11127                 btrfs_warn(fs_info,
11128                         "failed to trim %llu block group(s), last error %d",
11129                         bg_failed, bg_ret);
11130         mutex_lock(&fs_info->fs_devices->device_list_mutex);
11131         devices = &fs_info->fs_devices->devices;
11132         list_for_each_entry(device, devices, dev_list) {
11133                 ret = btrfs_trim_free_extents(device, &group_trimmed);
11134                 if (ret) {
11135                         dev_failed++;
11136                         dev_ret = ret;
11137                         break;
11138                 }
11139
11140                 trimmed += group_trimmed;
11141         }
11142         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11143
11144         if (dev_failed)
11145                 btrfs_warn(fs_info,
11146                         "failed to trim %llu device(s), last error %d",
11147                         dev_failed, dev_ret);
11148         range->len = trimmed;
11149         if (bg_ret)
11150                 return bg_ret;
11151         return dev_ret;
11152 }
11153
11154 /*
11155  * btrfs_{start,end}_write_no_snapshotting() are similar to
11156  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11157  * data into the page cache through nocow before the subvolume is snapshoted,
11158  * but flush the data into disk after the snapshot creation, or to prevent
11159  * operations while snapshotting is ongoing and that cause the snapshot to be
11160  * inconsistent (writes followed by expanding truncates for example).
11161  */
11162 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11163 {
11164         percpu_counter_dec(&root->subv_writers->counter);
11165         cond_wake_up(&root->subv_writers->wait);
11166 }
11167
11168 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11169 {
11170         if (atomic_read(&root->will_be_snapshotted))
11171                 return 0;
11172
11173         percpu_counter_inc(&root->subv_writers->counter);
11174         /*
11175          * Make sure counter is updated before we check for snapshot creation.
11176          */
11177         smp_mb();
11178         if (atomic_read(&root->will_be_snapshotted)) {
11179                 btrfs_end_write_no_snapshotting(root);
11180                 return 0;
11181         }
11182         return 1;
11183 }
11184
11185 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11186 {
11187         while (true) {
11188                 int ret;
11189
11190                 ret = btrfs_start_write_no_snapshotting(root);
11191                 if (ret)
11192                         break;
11193                 wait_var_event(&root->will_be_snapshotted,
11194                                !atomic_read(&root->will_be_snapshotted));
11195         }
11196 }
11197
11198 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11199 {
11200         struct btrfs_fs_info *fs_info = bg->fs_info;
11201
11202         spin_lock(&fs_info->unused_bgs_lock);
11203         if (list_empty(&bg->bg_list)) {
11204                 btrfs_get_block_group(bg);
11205                 trace_btrfs_add_unused_block_group(bg);
11206                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11207         }
11208         spin_unlock(&fs_info->unused_bgs_lock);
11209 }