btrfs: get fs_info from trans in write_one_cache_group
[sfrench/cifs-2.6.git] / fs / btrfs / extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31
32 #undef SCRAMBLE_DELAYED_REFS
33
/*
 * Values accepted by the 'force' argument of do_chunk_alloc().
 */
enum {
	/* Only allocate a chunk if we really need one. */
	CHUNK_ALLOC_NO_FORCE = 0,

	/*
	 * Only try to allocate a chunk if very few chunks are already
	 * allocated.  Used by the clustering code to make sure there is a
	 * good pool of storage to cluster in, without filling the FS with
	 * empty chunks.
	 */
	CHUNK_ALLOC_LIMITED = 1,

	/* Must try to allocate a chunk. */
	CHUNK_ALLOC_FORCE = 2,
};
53
54 /*
55  * Declare a helper function to detect underflow of various space info members
56  */
57 #define DECLARE_SPACE_INFO_UPDATE(name)                                 \
58 static inline void update_##name(struct btrfs_space_info *sinfo,        \
59                                  s64 bytes)                             \
60 {                                                                       \
61         if (bytes < 0 && sinfo->name < -bytes) {                        \
62                 WARN_ON(1);                                             \
63                 sinfo->name = 0;                                        \
64                 return;                                                 \
65         }                                                               \
66         sinfo->name += bytes;                                           \
67 }
68
69 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
70 DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
71
72 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
73                                struct btrfs_delayed_ref_node *node, u64 parent,
74                                u64 root_objectid, u64 owner_objectid,
75                                u64 owner_offset, int refs_to_drop,
76                                struct btrfs_delayed_extent_op *extra_op);
77 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
78                                     struct extent_buffer *leaf,
79                                     struct btrfs_extent_item *ei);
80 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
81                                       u64 parent, u64 root_objectid,
82                                       u64 flags, u64 owner, u64 offset,
83                                       struct btrfs_key *ins, int ref_mod);
84 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
85                                      struct btrfs_delayed_ref_node *node,
86                                      struct btrfs_delayed_extent_op *extent_op);
87 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
88                           int force);
89 static int find_next_key(struct btrfs_path *path, int level,
90                          struct btrfs_key *key);
91 static void dump_space_info(struct btrfs_fs_info *fs_info,
92                             struct btrfs_space_info *info, u64 bytes,
93                             int dump_block_groups);
94 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
95                                u64 num_bytes);
96 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
97                                      struct btrfs_space_info *space_info,
98                                      u64 num_bytes);
99 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
100                                      struct btrfs_space_info *space_info,
101                                      u64 num_bytes);
102
103 static noinline int
104 block_group_cache_done(struct btrfs_block_group_cache *cache)
105 {
106         smp_mb();
107         return cache->cached == BTRFS_CACHE_FINISHED ||
108                 cache->cached == BTRFS_CACHE_ERROR;
109 }
110
111 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
112 {
113         return (cache->flags & bits) == bits;
114 }
115
116 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
117 {
118         atomic_inc(&cache->count);
119 }
120
121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
122 {
123         if (atomic_dec_and_test(&cache->count)) {
124                 WARN_ON(cache->pinned > 0);
125                 WARN_ON(cache->reserved > 0);
126
127                 /*
128                  * If not empty, someone is still holding mutex of
129                  * full_stripe_lock, which can only be released by caller.
130                  * And it will definitely cause use-after-free when caller
131                  * tries to release full stripe lock.
132                  *
133                  * No better way to resolve, but only to warn.
134                  */
135                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
136                 kfree(cache->free_space_ctl);
137                 kfree(cache);
138         }
139 }
140
141 /*
142  * this adds the block group to the fs_info rb tree for the block group
143  * cache
144  */
145 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
146                                 struct btrfs_block_group_cache *block_group)
147 {
148         struct rb_node **p;
149         struct rb_node *parent = NULL;
150         struct btrfs_block_group_cache *cache;
151
152         spin_lock(&info->block_group_cache_lock);
153         p = &info->block_group_cache_tree.rb_node;
154
155         while (*p) {
156                 parent = *p;
157                 cache = rb_entry(parent, struct btrfs_block_group_cache,
158                                  cache_node);
159                 if (block_group->key.objectid < cache->key.objectid) {
160                         p = &(*p)->rb_left;
161                 } else if (block_group->key.objectid > cache->key.objectid) {
162                         p = &(*p)->rb_right;
163                 } else {
164                         spin_unlock(&info->block_group_cache_lock);
165                         return -EEXIST;
166                 }
167         }
168
169         rb_link_node(&block_group->cache_node, parent, p);
170         rb_insert_color(&block_group->cache_node,
171                         &info->block_group_cache_tree);
172
173         if (info->first_logical_byte > block_group->key.objectid)
174                 info->first_logical_byte = block_group->key.objectid;
175
176         spin_unlock(&info->block_group_cache_lock);
177
178         return 0;
179 }
180
181 /*
182  * This will return the block group at or after bytenr if contains is 0, else
183  * it will return the block group that contains the bytenr
184  */
185 static struct btrfs_block_group_cache *
186 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
187                               int contains)
188 {
189         struct btrfs_block_group_cache *cache, *ret = NULL;
190         struct rb_node *n;
191         u64 end, start;
192
193         spin_lock(&info->block_group_cache_lock);
194         n = info->block_group_cache_tree.rb_node;
195
196         while (n) {
197                 cache = rb_entry(n, struct btrfs_block_group_cache,
198                                  cache_node);
199                 end = cache->key.objectid + cache->key.offset - 1;
200                 start = cache->key.objectid;
201
202                 if (bytenr < start) {
203                         if (!contains && (!ret || start < ret->key.objectid))
204                                 ret = cache;
205                         n = n->rb_left;
206                 } else if (bytenr > start) {
207                         if (contains && bytenr <= end) {
208                                 ret = cache;
209                                 break;
210                         }
211                         n = n->rb_right;
212                 } else {
213                         ret = cache;
214                         break;
215                 }
216         }
217         if (ret) {
218                 btrfs_get_block_group(ret);
219                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
220                         info->first_logical_byte = ret->key.objectid;
221         }
222         spin_unlock(&info->block_group_cache_lock);
223
224         return ret;
225 }
226
227 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
228                                u64 start, u64 num_bytes)
229 {
230         u64 end = start + num_bytes - 1;
231         set_extent_bits(&fs_info->freed_extents[0],
232                         start, end, EXTENT_UPTODATE);
233         set_extent_bits(&fs_info->freed_extents[1],
234                         start, end, EXTENT_UPTODATE);
235         return 0;
236 }
237
238 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
239 {
240         struct btrfs_fs_info *fs_info = cache->fs_info;
241         u64 start, end;
242
243         start = cache->key.objectid;
244         end = start + cache->key.offset - 1;
245
246         clear_extent_bits(&fs_info->freed_extents[0],
247                           start, end, EXTENT_UPTODATE);
248         clear_extent_bits(&fs_info->freed_extents[1],
249                           start, end, EXTENT_UPTODATE);
250 }
251
252 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
253 {
254         struct btrfs_fs_info *fs_info = cache->fs_info;
255         u64 bytenr;
256         u64 *logical;
257         int stripe_len;
258         int i, nr, ret;
259
260         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
261                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
262                 cache->bytes_super += stripe_len;
263                 ret = add_excluded_extent(fs_info, cache->key.objectid,
264                                           stripe_len);
265                 if (ret)
266                         return ret;
267         }
268
269         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
270                 bytenr = btrfs_sb_offset(i);
271                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
272                                        bytenr, &logical, &nr, &stripe_len);
273                 if (ret)
274                         return ret;
275
276                 while (nr--) {
277                         u64 start, len;
278
279                         if (logical[nr] > cache->key.objectid +
280                             cache->key.offset)
281                                 continue;
282
283                         if (logical[nr] + stripe_len <= cache->key.objectid)
284                                 continue;
285
286                         start = logical[nr];
287                         if (start < cache->key.objectid) {
288                                 start = cache->key.objectid;
289                                 len = (logical[nr] + stripe_len) - start;
290                         } else {
291                                 len = min_t(u64, stripe_len,
292                                             cache->key.objectid +
293                                             cache->key.offset - start);
294                         }
295
296                         cache->bytes_super += len;
297                         ret = add_excluded_extent(fs_info, start, len);
298                         if (ret) {
299                                 kfree(logical);
300                                 return ret;
301                         }
302                 }
303
304                 kfree(logical);
305         }
306         return 0;
307 }
308
309 static struct btrfs_caching_control *
310 get_caching_control(struct btrfs_block_group_cache *cache)
311 {
312         struct btrfs_caching_control *ctl;
313
314         spin_lock(&cache->lock);
315         if (!cache->caching_ctl) {
316                 spin_unlock(&cache->lock);
317                 return NULL;
318         }
319
320         ctl = cache->caching_ctl;
321         refcount_inc(&ctl->count);
322         spin_unlock(&cache->lock);
323         return ctl;
324 }
325
326 static void put_caching_control(struct btrfs_caching_control *ctl)
327 {
328         if (refcount_dec_and_test(&ctl->count))
329                 kfree(ctl);
330 }
331
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper: punch holes into the free space of @block_group.
 *
 * Starting at the beginning of the block group, one chunk (nodesize for
 * metadata block groups, sectorsize otherwise) is removed from the free
 * space out of every two chunks, for as long as more than one chunk of
 * length remains.
 */
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 pos = block_group->key.objectid;
	u64 remaining = block_group->key.offset;
	u64 chunk;
	u64 stride;

	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
		chunk = fs_info->nodesize;
	else
		chunk = fs_info->sectorsize;
	stride = chunk << 1;

	while (remaining > chunk) {
		btrfs_remove_free_space(block_group, pos, chunk);
		pos += stride;
		/* Never let the remaining length wrap below zero. */
		remaining = (remaining < stride) ? 0 : remaining - stride;
	}
}
#endif
352
353 /*
354  * this is only called by cache_block_group, since we could have freed extents
355  * we need to check the pinned_extents for any extents that can't be used yet
356  * since their free space will be released as soon as the transaction commits.
357  */
358 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
359                        u64 start, u64 end)
360 {
361         struct btrfs_fs_info *info = block_group->fs_info;
362         u64 extent_start, extent_end, size, total_added = 0;
363         int ret;
364
365         while (start < end) {
366                 ret = find_first_extent_bit(info->pinned_extents, start,
367                                             &extent_start, &extent_end,
368                                             EXTENT_DIRTY | EXTENT_UPTODATE,
369                                             NULL);
370                 if (ret)
371                         break;
372
373                 if (extent_start <= start) {
374                         start = extent_end + 1;
375                 } else if (extent_start > start && extent_start < end) {
376                         size = extent_start - start;
377                         total_added += size;
378                         ret = btrfs_add_free_space(block_group, start,
379                                                    size);
380                         BUG_ON(ret); /* -ENOMEM or logic error */
381                         start = extent_end + 1;
382                 } else {
383                         break;
384                 }
385         }
386
387         if (start < end) {
388                 size = end - start;
389                 total_added += size;
390                 ret = btrfs_add_free_space(block_group, start, size);
391                 BUG_ON(ret); /* -ENOMEM or logic error */
392         }
393
394         return total_added;
395 }
396
397 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
398 {
399         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
400         struct btrfs_fs_info *fs_info = block_group->fs_info;
401         struct btrfs_root *extent_root = fs_info->extent_root;
402         struct btrfs_path *path;
403         struct extent_buffer *leaf;
404         struct btrfs_key key;
405         u64 total_found = 0;
406         u64 last = 0;
407         u32 nritems;
408         int ret;
409         bool wakeup = true;
410
411         path = btrfs_alloc_path();
412         if (!path)
413                 return -ENOMEM;
414
415         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
416
417 #ifdef CONFIG_BTRFS_DEBUG
418         /*
419          * If we're fragmenting we don't want to make anybody think we can
420          * allocate from this block group until we've had a chance to fragment
421          * the free space.
422          */
423         if (btrfs_should_fragment_free_space(block_group))
424                 wakeup = false;
425 #endif
426         /*
427          * We don't want to deadlock with somebody trying to allocate a new
428          * extent for the extent root while also trying to search the extent
429          * root to add free space.  So we skip locking and search the commit
430          * root, since its read-only
431          */
432         path->skip_locking = 1;
433         path->search_commit_root = 1;
434         path->reada = READA_FORWARD;
435
436         key.objectid = last;
437         key.offset = 0;
438         key.type = BTRFS_EXTENT_ITEM_KEY;
439
440 next:
441         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
442         if (ret < 0)
443                 goto out;
444
445         leaf = path->nodes[0];
446         nritems = btrfs_header_nritems(leaf);
447
448         while (1) {
449                 if (btrfs_fs_closing(fs_info) > 1) {
450                         last = (u64)-1;
451                         break;
452                 }
453
454                 if (path->slots[0] < nritems) {
455                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
456                 } else {
457                         ret = find_next_key(path, 0, &key);
458                         if (ret)
459                                 break;
460
461                         if (need_resched() ||
462                             rwsem_is_contended(&fs_info->commit_root_sem)) {
463                                 if (wakeup)
464                                         caching_ctl->progress = last;
465                                 btrfs_release_path(path);
466                                 up_read(&fs_info->commit_root_sem);
467                                 mutex_unlock(&caching_ctl->mutex);
468                                 cond_resched();
469                                 mutex_lock(&caching_ctl->mutex);
470                                 down_read(&fs_info->commit_root_sem);
471                                 goto next;
472                         }
473
474                         ret = btrfs_next_leaf(extent_root, path);
475                         if (ret < 0)
476                                 goto out;
477                         if (ret)
478                                 break;
479                         leaf = path->nodes[0];
480                         nritems = btrfs_header_nritems(leaf);
481                         continue;
482                 }
483
484                 if (key.objectid < last) {
485                         key.objectid = last;
486                         key.offset = 0;
487                         key.type = BTRFS_EXTENT_ITEM_KEY;
488
489                         if (wakeup)
490                                 caching_ctl->progress = last;
491                         btrfs_release_path(path);
492                         goto next;
493                 }
494
495                 if (key.objectid < block_group->key.objectid) {
496                         path->slots[0]++;
497                         continue;
498                 }
499
500                 if (key.objectid >= block_group->key.objectid +
501                     block_group->key.offset)
502                         break;
503
504                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
505                     key.type == BTRFS_METADATA_ITEM_KEY) {
506                         total_found += add_new_free_space(block_group, last,
507                                                           key.objectid);
508                         if (key.type == BTRFS_METADATA_ITEM_KEY)
509                                 last = key.objectid +
510                                         fs_info->nodesize;
511                         else
512                                 last = key.objectid + key.offset;
513
514                         if (total_found > CACHING_CTL_WAKE_UP) {
515                                 total_found = 0;
516                                 if (wakeup)
517                                         wake_up(&caching_ctl->wait);
518                         }
519                 }
520                 path->slots[0]++;
521         }
522         ret = 0;
523
524         total_found += add_new_free_space(block_group, last,
525                                           block_group->key.objectid +
526                                           block_group->key.offset);
527         caching_ctl->progress = (u64)-1;
528
529 out:
530         btrfs_free_path(path);
531         return ret;
532 }
533
534 static noinline void caching_thread(struct btrfs_work *work)
535 {
536         struct btrfs_block_group_cache *block_group;
537         struct btrfs_fs_info *fs_info;
538         struct btrfs_caching_control *caching_ctl;
539         int ret;
540
541         caching_ctl = container_of(work, struct btrfs_caching_control, work);
542         block_group = caching_ctl->block_group;
543         fs_info = block_group->fs_info;
544
545         mutex_lock(&caching_ctl->mutex);
546         down_read(&fs_info->commit_root_sem);
547
548         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
549                 ret = load_free_space_tree(caching_ctl);
550         else
551                 ret = load_extent_tree_free(caching_ctl);
552
553         spin_lock(&block_group->lock);
554         block_group->caching_ctl = NULL;
555         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
556         spin_unlock(&block_group->lock);
557
558 #ifdef CONFIG_BTRFS_DEBUG
559         if (btrfs_should_fragment_free_space(block_group)) {
560                 u64 bytes_used;
561
562                 spin_lock(&block_group->space_info->lock);
563                 spin_lock(&block_group->lock);
564                 bytes_used = block_group->key.offset -
565                         btrfs_block_group_used(&block_group->item);
566                 block_group->space_info->bytes_used += bytes_used >> 1;
567                 spin_unlock(&block_group->lock);
568                 spin_unlock(&block_group->space_info->lock);
569                 fragment_free_space(block_group);
570         }
571 #endif
572
573         caching_ctl->progress = (u64)-1;
574
575         up_read(&fs_info->commit_root_sem);
576         free_excluded_extents(block_group);
577         mutex_unlock(&caching_ctl->mutex);
578
579         wake_up(&caching_ctl->wait);
580
581         put_caching_control(caching_ctl);
582         btrfs_put_block_group(block_group);
583 }
584
585 static int cache_block_group(struct btrfs_block_group_cache *cache,
586                              int load_cache_only)
587 {
588         DEFINE_WAIT(wait);
589         struct btrfs_fs_info *fs_info = cache->fs_info;
590         struct btrfs_caching_control *caching_ctl;
591         int ret = 0;
592
593         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
594         if (!caching_ctl)
595                 return -ENOMEM;
596
597         INIT_LIST_HEAD(&caching_ctl->list);
598         mutex_init(&caching_ctl->mutex);
599         init_waitqueue_head(&caching_ctl->wait);
600         caching_ctl->block_group = cache;
601         caching_ctl->progress = cache->key.objectid;
602         refcount_set(&caching_ctl->count, 1);
603         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
604                         caching_thread, NULL, NULL);
605
606         spin_lock(&cache->lock);
607         /*
608          * This should be a rare occasion, but this could happen I think in the
609          * case where one thread starts to load the space cache info, and then
610          * some other thread starts a transaction commit which tries to do an
611          * allocation while the other thread is still loading the space cache
612          * info.  The previous loop should have kept us from choosing this block
613          * group, but if we've moved to the state where we will wait on caching
614          * block groups we need to first check if we're doing a fast load here,
615          * so we can wait for it to finish, otherwise we could end up allocating
616          * from a block group who's cache gets evicted for one reason or
617          * another.
618          */
619         while (cache->cached == BTRFS_CACHE_FAST) {
620                 struct btrfs_caching_control *ctl;
621
622                 ctl = cache->caching_ctl;
623                 refcount_inc(&ctl->count);
624                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
625                 spin_unlock(&cache->lock);
626
627                 schedule();
628
629                 finish_wait(&ctl->wait, &wait);
630                 put_caching_control(ctl);
631                 spin_lock(&cache->lock);
632         }
633
634         if (cache->cached != BTRFS_CACHE_NO) {
635                 spin_unlock(&cache->lock);
636                 kfree(caching_ctl);
637                 return 0;
638         }
639         WARN_ON(cache->caching_ctl);
640         cache->caching_ctl = caching_ctl;
641         cache->cached = BTRFS_CACHE_FAST;
642         spin_unlock(&cache->lock);
643
644         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
645                 mutex_lock(&caching_ctl->mutex);
646                 ret = load_free_space_cache(fs_info, cache);
647
648                 spin_lock(&cache->lock);
649                 if (ret == 1) {
650                         cache->caching_ctl = NULL;
651                         cache->cached = BTRFS_CACHE_FINISHED;
652                         cache->last_byte_to_unpin = (u64)-1;
653                         caching_ctl->progress = (u64)-1;
654                 } else {
655                         if (load_cache_only) {
656                                 cache->caching_ctl = NULL;
657                                 cache->cached = BTRFS_CACHE_NO;
658                         } else {
659                                 cache->cached = BTRFS_CACHE_STARTED;
660                                 cache->has_caching_ctl = 1;
661                         }
662                 }
663                 spin_unlock(&cache->lock);
664 #ifdef CONFIG_BTRFS_DEBUG
665                 if (ret == 1 &&
666                     btrfs_should_fragment_free_space(cache)) {
667                         u64 bytes_used;
668
669                         spin_lock(&cache->space_info->lock);
670                         spin_lock(&cache->lock);
671                         bytes_used = cache->key.offset -
672                                 btrfs_block_group_used(&cache->item);
673                         cache->space_info->bytes_used += bytes_used >> 1;
674                         spin_unlock(&cache->lock);
675                         spin_unlock(&cache->space_info->lock);
676                         fragment_free_space(cache);
677                 }
678 #endif
679                 mutex_unlock(&caching_ctl->mutex);
680
681                 wake_up(&caching_ctl->wait);
682                 if (ret == 1) {
683                         put_caching_control(caching_ctl);
684                         free_excluded_extents(cache);
685                         return 0;
686                 }
687         } else {
688                 /*
689                  * We're either using the free space tree or no caching at all.
690                  * Set cached to the appropriate value and wakeup any waiters.
691                  */
692                 spin_lock(&cache->lock);
693                 if (load_cache_only) {
694                         cache->caching_ctl = NULL;
695                         cache->cached = BTRFS_CACHE_NO;
696                 } else {
697                         cache->cached = BTRFS_CACHE_STARTED;
698                         cache->has_caching_ctl = 1;
699                 }
700                 spin_unlock(&cache->lock);
701                 wake_up(&caching_ctl->wait);
702         }
703
704         if (load_cache_only) {
705                 put_caching_control(caching_ctl);
706                 return 0;
707         }
708
709         down_write(&fs_info->commit_root_sem);
710         refcount_inc(&caching_ctl->count);
711         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
712         up_write(&fs_info->commit_root_sem);
713
714         btrfs_get_block_group(cache);
715
716         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
717
718         return ret;
719 }
720
721 /*
722  * return the block group that starts at or after bytenr
723  */
724 static struct btrfs_block_group_cache *
725 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
726 {
727         return block_group_cache_tree_search(info, bytenr, 0);
728 }
729
730 /*
731  * return the block group that contains the given bytenr
732  */
733 struct btrfs_block_group_cache *btrfs_lookup_block_group(
734                                                  struct btrfs_fs_info *info,
735                                                  u64 bytenr)
736 {
737         return block_group_cache_tree_search(info, bytenr, 1);
738 }
739
740 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
741                                                   u64 flags)
742 {
743         struct list_head *head = &info->space_info;
744         struct btrfs_space_info *found;
745
746         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
747
748         rcu_read_lock();
749         list_for_each_entry_rcu(found, head, list) {
750                 if (found->flags & flags) {
751                         rcu_read_unlock();
752                         return found;
753                 }
754         }
755         rcu_read_unlock();
756         return NULL;
757 }
758
759 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
760                              bool metadata, u64 root_objectid)
761 {
762         struct btrfs_space_info *space_info;
763         u64 flags;
764
765         if (metadata) {
766                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
767                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
768                 else
769                         flags = BTRFS_BLOCK_GROUP_METADATA;
770         } else {
771                 flags = BTRFS_BLOCK_GROUP_DATA;
772         }
773
774         space_info = __find_space_info(fs_info, flags);
775         ASSERT(space_info);
776         percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
777                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
778 }
779
780 /*
781  * after adding space to the filesystem, we need to clear the full flags
782  * on all the space infos.
783  */
784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
785 {
786         struct list_head *head = &info->space_info;
787         struct btrfs_space_info *found;
788
789         rcu_read_lock();
790         list_for_each_entry_rcu(found, head, list)
791                 found->full = 0;
792         rcu_read_unlock();
793 }
794
795 /* simple helper to search for an existing data extent at a given offset */
796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
797 {
798         int ret;
799         struct btrfs_key key;
800         struct btrfs_path *path;
801
802         path = btrfs_alloc_path();
803         if (!path)
804                 return -ENOMEM;
805
806         key.objectid = start;
807         key.offset = len;
808         key.type = BTRFS_EXTENT_ITEM_KEY;
809         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
810         btrfs_free_path(path);
811         return ret;
812 }
813
/*
 * Look up the reference count and flags of an extent.
 *
 * The base values are read from the extent item in the extent tree (0 refs
 * and 0 flags when no item is found).  When @trans is given, the delayed ref
 * head for @bytenr, if one exists, is folded in: its ref_mod is added to the
 * count and, if it carries an extent_op with update_flags set, flags_to_set
 * is OR-ed into the flags.  The result is thus the on-disk state combined
 * with the modifications queued in the delayed ref head.
 *
 * @trans:    transaction handle, or NULL to search the commit root without
 *            locking and skip the delayed ref lookup
 * @fs_info:  filesystem whose extent root is searched
 * @bytenr:   key objectid of the extent
 * @offset:   key offset to search for; replaced by fs_info->nodesize when
 *            @metadata is set but SKINNY_METADATA is not enabled
 * @metadata: non-zero to search for a METADATA_ITEM key instead of an
 *            EXTENT_ITEM key
 * @refs:     if not NULL, receives the reference count
 * @flags:    if not NULL, receives the extent flags
 *
 * Returns 0 on success (also when no extent item was found), -ENOMEM if the
 * path cannot be allocated, a negative error from btrfs_search_slot(), or
 * -EINVAL if the item found is smaller than struct btrfs_extent_item.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Without a transaction, read the commit root and take no tree locks. */
	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	/* The key is rebuilt on every retry; the fallback below overwrites it. */
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	/*
	 * No METADATA_ITEM was found: check whether the previous slot holds an
	 * EXTENT_ITEM for the same bytenr with offset == nodesize and, if so,
	 * use that item instead.
	 */
	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			/*
			 * Item too small to be a struct btrfs_extent_item:
			 * reported through the v0 error helper and failed
			 * with -EINVAL.
			 */
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		/* No extent item: start from zero and report success. */
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	/* Delayed refs hang off the transaction; nothing more to do without one. */
	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			/*
			 * Take a reference on the head before the spinlock is
			 * dropped; it is put again below, after waiting on
			 * the mutex.
			 */
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		/* Fold the queued reference count change into the result. */
		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
949
950 /*
951  * Back reference rules.  Back refs have three main goals:
952  *
953  * 1) differentiate between all holders of references to an extent so that
954  *    when a reference is dropped we can make sure it was a valid reference
955  *    before freeing the extent.
956  *
957  * 2) Provide enough information to quickly find the holders of an extent
958  *    if we notice a given block is corrupted or bad.
959  *
960  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
961  *    maintenance.  This is actually the same as #2, but with a slightly
962  *    different use case.
963  *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back ref. Full back refs are actually generic, and can
 * be used in all cases where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead. Every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
974  *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
981  *
982  * When a tree block is COWed through a tree, there are four cases:
983  *
984  * The reference count of the block is one and the tree is the block's
985  * owner tree. Nothing to do in this case.
986  *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are carried over to the new block.
997  *
998  * The reference count of the block is greater than one and the tree is
999  * not the block's owner tree. Add implicit back refs for every pointer in
1000  * the new block, increase lower level extents' reference count.
1001  *
1002  * Back Reference Key composing:
1003  *
1004  * The key objectid corresponds to the first byte in the extent,
1005  * The key type is used to differentiate between types of back refs.
1006  * There are different meanings of the key offset for different types
1007  * of back refs.
1008  *
1009  * File extents can be referenced by:
1010  *
1011  * - multiple snapshots, subvolumes, or different generations in one subvol
1012  * - different files inside a single subvolume
1013  * - different offsets inside a file (bookend extents in file.c)
1014  *
1015  * The extent ref structure for the implicit back refs has fields for:
1016  *
1017  * - Objectid of the subvolume root
1018  * - objectid of the file holding the reference
1019  * - original offset in the file
1020  * - how many bookend extents
1021  *
1022  * The key offset for the implicit back refs is hash of the first
1023  * three fields.
1024  *
1025  * The extent ref structure for the full back refs has field for:
1026  *
1027  * - number of pointers in the tree leaf
1028  *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
1031  *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
1034  *
1035  *     (root_key.objectid, inode objectid, offset in file, 1)
1036  *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
1039  *
1040  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1041  *
1042  * Btree extents can be referenced by:
1043  *
1044  * - Different subvolumes
1045  *
1046  * Both the implicit back refs and the full back refs for tree blocks
1047  * only consist of key. The key offset for the implicit back refs is
1048  * objectid of block's owner tree. The key offset for the full back refs
1049  * is the first byte of parent block.
1050  *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in the
 * tree block info structure.
1054  */
1055
1056 /*
1057  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1058  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
1059  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1060  */
1061 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1062                                      struct btrfs_extent_inline_ref *iref,
1063                                      enum btrfs_inline_ref_type is_data)
1064 {
1065         int type = btrfs_extent_inline_ref_type(eb, iref);
1066         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1067
1068         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1069             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1070             type == BTRFS_SHARED_DATA_REF_KEY ||
1071             type == BTRFS_EXTENT_DATA_REF_KEY) {
1072                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1073                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1074                                 return type;
1075                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1076                                 ASSERT(eb->fs_info);
1077                                 /*
1078                                  * Every shared one has parent tree
1079                                  * block, which must be aligned to
1080                                  * nodesize.
1081                                  */
1082                                 if (offset &&
1083                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1084                                         return type;
1085                         }
1086                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1087                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1088                                 return type;
1089                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1090                                 ASSERT(eb->fs_info);
1091                                 /*
1092                                  * Every shared one has parent tree
1093                                  * block, which must be aligned to
1094                                  * nodesize.
1095                                  */
1096                                 if (offset &&
1097                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1098                                         return type;
1099                         }
1100                 } else {
1101                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1102                         return type;
1103                 }
1104         }
1105
1106         btrfs_print_leaf((struct extent_buffer *)eb);
1107         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1108                   eb->start, type);
1109         WARN_ON(1);
1110
1111         return BTRFS_REF_TYPE_INVALID;
1112 }
1113
1114 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1115 {
1116         u32 high_crc = ~(u32)0;
1117         u32 low_crc = ~(u32)0;
1118         __le64 lenum;
1119
1120         lenum = cpu_to_le64(root_objectid);
1121         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1122         lenum = cpu_to_le64(owner);
1123         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1124         lenum = cpu_to_le64(offset);
1125         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1126
1127         return ((u64)high_crc << 31) ^ (u64)low_crc;
1128 }
1129
1130 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1131                                      struct btrfs_extent_data_ref *ref)
1132 {
1133         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1134                                     btrfs_extent_data_ref_objectid(leaf, ref),
1135                                     btrfs_extent_data_ref_offset(leaf, ref));
1136 }
1137
1138 static int match_extent_data_ref(struct extent_buffer *leaf,
1139                                  struct btrfs_extent_data_ref *ref,
1140                                  u64 root_objectid, u64 owner, u64 offset)
1141 {
1142         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1143             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1144             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1145                 return 0;
1146         return 1;
1147 }
1148
1149 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1150                                            struct btrfs_path *path,
1151                                            u64 bytenr, u64 parent,
1152                                            u64 root_objectid,
1153                                            u64 owner, u64 offset)
1154 {
1155         struct btrfs_root *root = trans->fs_info->extent_root;
1156         struct btrfs_key key;
1157         struct btrfs_extent_data_ref *ref;
1158         struct extent_buffer *leaf;
1159         u32 nritems;
1160         int ret;
1161         int recow;
1162         int err = -ENOENT;
1163
1164         key.objectid = bytenr;
1165         if (parent) {
1166                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1167                 key.offset = parent;
1168         } else {
1169                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1170                 key.offset = hash_extent_data_ref(root_objectid,
1171                                                   owner, offset);
1172         }
1173 again:
1174         recow = 0;
1175         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1176         if (ret < 0) {
1177                 err = ret;
1178                 goto fail;
1179         }
1180
1181         if (parent) {
1182                 if (!ret)
1183                         return 0;
1184                 goto fail;
1185         }
1186
1187         leaf = path->nodes[0];
1188         nritems = btrfs_header_nritems(leaf);
1189         while (1) {
1190                 if (path->slots[0] >= nritems) {
1191                         ret = btrfs_next_leaf(root, path);
1192                         if (ret < 0)
1193                                 err = ret;
1194                         if (ret)
1195                                 goto fail;
1196
1197                         leaf = path->nodes[0];
1198                         nritems = btrfs_header_nritems(leaf);
1199                         recow = 1;
1200                 }
1201
1202                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1203                 if (key.objectid != bytenr ||
1204                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1205                         goto fail;
1206
1207                 ref = btrfs_item_ptr(leaf, path->slots[0],
1208                                      struct btrfs_extent_data_ref);
1209
1210                 if (match_extent_data_ref(leaf, ref, root_objectid,
1211                                           owner, offset)) {
1212                         if (recow) {
1213                                 btrfs_release_path(path);
1214                                 goto again;
1215                         }
1216                         err = 0;
1217                         break;
1218                 }
1219                 path->slots[0]++;
1220         }
1221 fail:
1222         return err;
1223 }
1224
/*
 * Add @refs_to_add references to the data back ref item of extent @bytenr,
 * creating the item if it does not exist yet.
 *
 * With @parent set, the item is a SHARED_DATA_REF keyed by @parent.
 * Otherwise it is an EXTENT_DATA_REF whose key offset starts at the hash of
 * (@root_objectid, @owner, @offset); if an item with that key exists but
 * records a different (root, owner, offset), the key offset is incremented
 * and the insertion retried until a free or matching slot is found.
 *
 * @path is released before returning, on success and on failure.
 *
 * Returns 0 on success or a negative error from btrfs_insert_empty_item()
 * other than -EEXIST.
 */
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	/* -EEXIST is not a failure here: the existing item gets updated. */
	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			/* Freshly inserted item: the count starts at refs_to_add. */
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			/* Item already existed: bump its count. */
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		/*
		 * An existing item with our key may belong to a different
		 * (root, owner, offset).  Probe the following key offsets
		 * until the matching item is found or an insertion succeeds.
		 */
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			/* New item: fill in every field. */
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			/* Matching item already existed: bump its count. */
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	/* A remaining -EEXIST meant "updated in place"; report success. */
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}
1303
1304 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1305                                            struct btrfs_path *path,
1306                                            int refs_to_drop, int *last_ref)
1307 {
1308         struct btrfs_key key;
1309         struct btrfs_extent_data_ref *ref1 = NULL;
1310         struct btrfs_shared_data_ref *ref2 = NULL;
1311         struct extent_buffer *leaf;
1312         u32 num_refs = 0;
1313         int ret = 0;
1314
1315         leaf = path->nodes[0];
1316         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1317
1318         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1319                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1320                                       struct btrfs_extent_data_ref);
1321                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1322         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1323                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1324                                       struct btrfs_shared_data_ref);
1325                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1326         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1327                 btrfs_print_v0_err(trans->fs_info);
1328                 btrfs_abort_transaction(trans, -EINVAL);
1329                 return -EINVAL;
1330         } else {
1331                 BUG();
1332         }
1333
1334         BUG_ON(num_refs < refs_to_drop);
1335         num_refs -= refs_to_drop;
1336
1337         if (num_refs == 0) {
1338                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1339                 *last_ref = 1;
1340         } else {
1341                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1342                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1343                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1344                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1345                 btrfs_mark_buffer_dirty(leaf);
1346         }
1347         return ret;
1348 }
1349
1350 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1351                                           struct btrfs_extent_inline_ref *iref)
1352 {
1353         struct btrfs_key key;
1354         struct extent_buffer *leaf;
1355         struct btrfs_extent_data_ref *ref1;
1356         struct btrfs_shared_data_ref *ref2;
1357         u32 num_refs = 0;
1358         int type;
1359
1360         leaf = path->nodes[0];
1361         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1362
1363         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1364         if (iref) {
1365                 /*
1366                  * If type is invalid, we should have bailed out earlier than
1367                  * this call.
1368                  */
1369                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1370                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1371                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1372                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1373                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1374                 } else {
1375                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1376                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1377                 }
1378         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1379                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1380                                       struct btrfs_extent_data_ref);
1381                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1382         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1383                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1384                                       struct btrfs_shared_data_ref);
1385                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1386         } else {
1387                 WARN_ON(1);
1388         }
1389         return num_refs;
1390 }
1391
1392 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1393                                           struct btrfs_path *path,
1394                                           u64 bytenr, u64 parent,
1395                                           u64 root_objectid)
1396 {
1397         struct btrfs_root *root = trans->fs_info->extent_root;
1398         struct btrfs_key key;
1399         int ret;
1400
1401         key.objectid = bytenr;
1402         if (parent) {
1403                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1404                 key.offset = parent;
1405         } else {
1406                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1407                 key.offset = root_objectid;
1408         }
1409
1410         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1411         if (ret > 0)
1412                 ret = -ENOENT;
1413         return ret;
1414 }
1415
1416 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1417                                           struct btrfs_path *path,
1418                                           u64 bytenr, u64 parent,
1419                                           u64 root_objectid)
1420 {
1421         struct btrfs_key key;
1422         int ret;
1423
1424         key.objectid = bytenr;
1425         if (parent) {
1426                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1427                 key.offset = parent;
1428         } else {
1429                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1430                 key.offset = root_objectid;
1431         }
1432
1433         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1434                                       path, &key, 0);
1435         btrfs_release_path(path);
1436         return ret;
1437 }
1438
1439 static inline int extent_ref_type(u64 parent, u64 owner)
1440 {
1441         int type;
1442         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1443                 if (parent > 0)
1444                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1445                 else
1446                         type = BTRFS_TREE_BLOCK_REF_KEY;
1447         } else {
1448                 if (parent > 0)
1449                         type = BTRFS_SHARED_DATA_REF_KEY;
1450                 else
1451                         type = BTRFS_EXTENT_DATA_REF_KEY;
1452         }
1453         return type;
1454 }
1455
1456 static int find_next_key(struct btrfs_path *path, int level,
1457                          struct btrfs_key *key)
1458
1459 {
1460         for (; level < BTRFS_MAX_LEVEL; level++) {
1461                 if (!path->nodes[level])
1462                         break;
1463                 if (path->slots[level] + 1 >=
1464                     btrfs_header_nritems(path->nodes[level]))
1465                         continue;
1466                 if (level == 0)
1467                         btrfs_item_key_to_cpu(path->nodes[level], key,
1468                                               path->slots[level] + 1);
1469                 else
1470                         btrfs_node_key_to_cpu(path->nodes[level], key,
1471                                               path->slots[level] + 1);
1472                 return 0;
1473         }
1474         return 1;
1475 }
1476
1477 /*
1478  * look for inline back ref. if back ref is found, *ref_ret is set
1479  * to the address of inline back ref, and 0 is returned.
1480  *
1481  * if back ref isn't found, *ref_ret is set to the address where it
1482  * should be inserted, and -ENOENT is returned.
1483  *
1484  * if insert is true and there are too many inline back refs, the path
1485  * points to the extent item, and -EAGAIN is returned.
1486  *
1487  * NOTE: inline back refs are ordered in the same way that back ref
1488  *       items in the tree are ordered.
1489  */
/*
 * Look inside the extent item for the extent at @bytenr for an inline back
 * reference matching @parent / @root_objectid / @owner / @offset.
 *
 * @trans:         transaction handle; the extent root is taken from its fs_info
 * @path:          path used for the tree search; left pointing at the extent
 *                 item on the paths that get past the search
 * @ref_ret:       on return 0 points at the matching inline ref; on -ENOENT
 *                 (when the item itself was found) points at the position
 *                 where the walk stopped.  It is NOT written on any of the
 *                 "goto out" paths (search error, missing item, -EINVAL,
 *                 -EUCLEAN, -EAGAIN).
 * @insert:        non-zero when the caller intends to add an inline ref; the
 *                 search then asks for room for one ref of the wanted type
 *                 and keeps the path locks until the function returns.
 *
 * Return: 0 if a matching inline ref was found;
 *         -ENOENT if no matching inline ref exists (or, with !@insert, the
 *         extent item itself was not found);
 *         -EIO if @insert is set and the extent item was not found;
 *         -EAGAIN if @insert is set and the ref must not be added inline
 *         (item would reach BTRFS_MAX_EXTENT_ITEM_SIZE, or another item for
 *         @bytenr follows);
 *         -EINVAL if the item is smaller than struct btrfs_extent_item (the
 *         transaction is aborted);
 *         -EUCLEAN if an inline ref has an invalid type;
 *         or a negative error from btrfs_search_slot().
 */
1490 static noinline_for_stack
1491 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1492                                  struct btrfs_path *path,
1493                                  struct btrfs_extent_inline_ref **ref_ret,
1494                                  u64 bytenr, u64 num_bytes,
1495                                  u64 parent, u64 root_objectid,
1496                                  u64 owner, u64 offset, int insert)
1497 {
1498         struct btrfs_fs_info *fs_info = trans->fs_info;
1499         struct btrfs_root *root = fs_info->extent_root;
1500         struct btrfs_key key;
1501         struct extent_buffer *leaf;
1502         struct btrfs_extent_item *ei;
1503         struct btrfs_extent_inline_ref *iref;
1504         u64 flags;
1505         u64 item_size;
1506         unsigned long ptr;
1507         unsigned long end;
1508         int extra_size;
1509         int type;
1510         int want;
1511         int ret;
1512         int err = 0;
1513         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1514         int needed;
1515
1516         key.objectid = bytenr;
1517         key.type = BTRFS_EXTENT_ITEM_KEY;
1518         key.offset = num_bytes;
1519
             /* The inline ref type we are looking for. */
1520         want = extent_ref_type(parent, owner);
             /*
              * extra_size is handed to btrfs_search_slot() below: the size of
              * one inline ref of the wanted type when inserting, -1 otherwise.
              */
1521         if (insert) {
1522                 extra_size = btrfs_extent_inline_ref_size(want);
1523                 path->keep_locks = 1;
1524         } else
1525                 extra_size = -1;
1526
1527         /*
1528          * Owner is our level, so we can just add one to get the level for the
1529          * block we are interested in.
1530          */
1531         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1532                 key.type = BTRFS_METADATA_ITEM_KEY;
1533                 key.offset = owner;
1534         }
1535
1536 again:
1537         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1538         if (ret < 0) {
1539                 err = ret;
1540                 goto out;
1541         }
1542
1543         /*
1544          * We may be a newly converted file system which still has the old fat
1545          * extent entries for metadata, so try and see if we have one of those.
1546          */
1547         if (ret > 0 && skinny_metadata) {
                     /* Cleared so this fallback is attempted at most once. */
1548                 skinny_metadata = false;
1549                 if (path->slots[0]) {
1550                         path->slots[0]--;
1551                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1552                                               path->slots[0]);
1553                         if (key.objectid == bytenr &&
1554                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1555                             key.offset == num_bytes)
1556                                 ret = 0;
1557                 }
1558                 if (ret) {
1559                         key.objectid = bytenr;
1560                         key.type = BTRFS_EXTENT_ITEM_KEY;
1561                         key.offset = num_bytes;
1562                         btrfs_release_path(path);
1563                         goto again;
1564                 }
1565         }
1566
             /*
              * A missing extent item is a plain -ENOENT for lookups, but is
              * unexpected (WARN + -EIO) when the caller wants to insert.
              */
1567         if (ret && !insert) {
1568                 err = -ENOENT;
1569                 goto out;
1570         } else if (WARN_ON(ret)) {
1571                 err = -EIO;
1572                 goto out;
1573         }
1574
1575         leaf = path->nodes[0];
1576         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1577         if (unlikely(item_size < sizeof(*ei))) {
1578                 err = -EINVAL;
1579                 btrfs_print_v0_err(fs_info);
1580                 btrfs_abort_transaction(trans, err);
1581                 goto out;
1582         }
1583
1584         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1585         flags = btrfs_extent_flags(leaf, ei);
1586
             /* [ptr, end) is the area after the extent item header. */
1587         ptr = (unsigned long)(ei + 1);
1588         end = (unsigned long)ei + item_size;
1589
             /*
              * Tree block items found via the non-skinny key carry a
              * struct btrfs_tree_block_info before the inline refs; skip it.
              */
1590         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1591                 ptr += sizeof(struct btrfs_tree_block_info);
1592                 BUG_ON(ptr > end);
1593         }
1594
1595         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1596                 needed = BTRFS_REF_TYPE_DATA;
1597         else
1598                 needed = BTRFS_REF_TYPE_BLOCK;
1599
             /* Assume "not found" until the walk below finds a match. */
1600         err = -ENOENT;
             /*
              * Walk the inline refs.  NOTE(review): the early breaks on
              * "want < type" and on the hash/offset comparisons look like they
              * rely on the inline refs being kept in sorted order - confirm.
              */
1601         while (1) {
1602                 if (ptr >= end) {
1603                         WARN_ON(ptr > end);
1604                         break;
1605                 }
1606                 iref = (struct btrfs_extent_inline_ref *)ptr;
1607                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1608                 if (type == BTRFS_REF_TYPE_INVALID) {
1609                         err = -EUCLEAN;
1610                         goto out;
1611                 }
1612
1613                 if (want < type)
1614                         break;
1615                 if (want > type) {
1616                         ptr += btrfs_extent_inline_ref_size(type);
1617                         continue;
1618                 }
1619
1620                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1621                         struct btrfs_extent_data_ref *dref;
1622                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1623                         if (match_extent_data_ref(leaf, dref, root_objectid,
1624                                                   owner, offset)) {
1625                                 err = 0;
1626                                 break;
1627                         }
1628                         if (hash_extent_data_ref_item(leaf, dref) <
1629                             hash_extent_data_ref(root_objectid, owner, offset))
1630                                 break;
1631                 } else {
1632                         u64 ref_offset;
1633                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1634                         if (parent > 0) {
1635                                 if (parent == ref_offset) {
1636                                         err = 0;
1637                                         break;
1638                                 }
1639                                 if (ref_offset < parent)
1640                                         break;
1641                         } else {
1642                                 if (root_objectid == ref_offset) {
1643                                         err = 0;
1644                                         break;
1645                                 }
1646                                 if (ref_offset < root_objectid)
1647                                         break;
1648                         }
1649                 }
1650                 ptr += btrfs_extent_inline_ref_size(type);
1651         }
1652         if (err == -ENOENT && insert) {
1653                 if (item_size + extra_size >=
1654                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1655                         err = -EAGAIN;
1656                         goto out;
1657                 }
1658                 /*
1659                  * To add new inline back ref, we have to make sure
1660                  * there is no corresponding back ref item.
1661                  * For simplicity, we just do not add new inline back
1662                  * ref if there is any kind of item for this block
1663                  */
1664                 if (find_next_key(path, 0, &key) == 0 &&
1665                     key.objectid == bytenr &&
1666                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1667                         err = -EAGAIN;
1668                         goto out;
1669                 }
1670         }
             /* Match position (err == 0) or where the walk stopped (-ENOENT). */
1671         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1672 out:
1673         if (insert) {
1674                 path->keep_locks = 0;
1675                 btrfs_unlock_up_safe(path, 1);
1676         }
1677         return err;
1678 }
1679
1680 /*
1681  * helper to add new inline back ref
1682  */
1683 static noinline_for_stack
1684 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1685                                  struct btrfs_path *path,
1686                                  struct btrfs_extent_inline_ref *iref,
1687                                  u64 parent, u64 root_objectid,
1688                                  u64 owner, u64 offset, int refs_to_add,
1689                                  struct btrfs_delayed_extent_op *extent_op)
1690 {
1691         struct extent_buffer *leaf;
1692         struct btrfs_extent_item *ei;
1693         unsigned long ptr;
1694         unsigned long end;
1695         unsigned long item_offset;
1696         u64 refs;
1697         int size;
1698         int type;
1699
1700         leaf = path->nodes[0];
1701         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1702         item_offset = (unsigned long)iref - (unsigned long)ei;
1703
1704         type = extent_ref_type(parent, owner);
1705         size = btrfs_extent_inline_ref_size(type);
1706
1707         btrfs_extend_item(fs_info, path, size);
1708
1709         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1710         refs = btrfs_extent_refs(leaf, ei);
1711         refs += refs_to_add;
1712         btrfs_set_extent_refs(leaf, ei, refs);
1713         if (extent_op)
1714                 __run_delayed_extent_op(extent_op, leaf, ei);
1715
1716         ptr = (unsigned long)ei + item_offset;
1717         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1718         if (ptr < end - size)
1719                 memmove_extent_buffer(leaf, ptr + size, ptr,
1720                                       end - size - ptr);
1721
1722         iref = (struct btrfs_extent_inline_ref *)ptr;
1723         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1724         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1725                 struct btrfs_extent_data_ref *dref;
1726                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1727                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1728                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1729                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1730                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1731         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1732                 struct btrfs_shared_data_ref *sref;
1733                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1734                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1735                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1736         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1737                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1738         } else {
1739                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1740         }
1741         btrfs_mark_buffer_dirty(leaf);
1742 }
1743
1744 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1745                                  struct btrfs_path *path,
1746                                  struct btrfs_extent_inline_ref **ref_ret,
1747                                  u64 bytenr, u64 num_bytes, u64 parent,
1748                                  u64 root_objectid, u64 owner, u64 offset)
1749 {
1750         int ret;
1751
1752         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1753                                            num_bytes, parent, root_objectid,
1754                                            owner, offset, 0);
1755         if (ret != -ENOENT)
1756                 return ret;
1757
1758         btrfs_release_path(path);
1759         *ref_ret = NULL;
1760
1761         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1762                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1763                                             root_objectid);
1764         } else {
1765                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1766                                              root_objectid, owner, offset);
1767         }
1768         return ret;
1769 }
1770
1771 /*
1772  * helper to update/remove inline back ref
1773  */
1774 static noinline_for_stack
1775 void update_inline_extent_backref(struct btrfs_path *path,
1776                                   struct btrfs_extent_inline_ref *iref,
1777                                   int refs_to_mod,
1778                                   struct btrfs_delayed_extent_op *extent_op,
1779                                   int *last_ref)
1780 {
1781         struct extent_buffer *leaf = path->nodes[0];
1782         struct btrfs_fs_info *fs_info = leaf->fs_info;
1783         struct btrfs_extent_item *ei;
1784         struct btrfs_extent_data_ref *dref = NULL;
1785         struct btrfs_shared_data_ref *sref = NULL;
1786         unsigned long ptr;
1787         unsigned long end;
1788         u32 item_size;
1789         int size;
1790         int type;
1791         u64 refs;
1792
1793         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1794         refs = btrfs_extent_refs(leaf, ei);
1795         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1796         refs += refs_to_mod;
1797         btrfs_set_extent_refs(leaf, ei, refs);
1798         if (extent_op)
1799                 __run_delayed_extent_op(extent_op, leaf, ei);
1800
1801         /*
1802          * If type is invalid, we should have bailed out after
1803          * lookup_inline_extent_backref().
1804          */
1805         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1806         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1807
1808         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1809                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1810                 refs = btrfs_extent_data_ref_count(leaf, dref);
1811         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1812                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1813                 refs = btrfs_shared_data_ref_count(leaf, sref);
1814         } else {
1815                 refs = 1;
1816                 BUG_ON(refs_to_mod != -1);
1817         }
1818
1819         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1820         refs += refs_to_mod;
1821
1822         if (refs > 0) {
1823                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1824                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1825                 else
1826                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1827         } else {
1828                 *last_ref = 1;
1829                 size =  btrfs_extent_inline_ref_size(type);
1830                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1831                 ptr = (unsigned long)iref;
1832                 end = (unsigned long)ei + item_size;
1833                 if (ptr + size < end)
1834                         memmove_extent_buffer(leaf, ptr, ptr + size,
1835                                               end - ptr - size);
1836                 item_size -= size;
1837                 btrfs_truncate_item(fs_info, path, item_size, 1);
1838         }
1839         btrfs_mark_buffer_dirty(leaf);
1840 }
1841
1842 static noinline_for_stack
1843 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1844                                  struct btrfs_path *path,
1845                                  u64 bytenr, u64 num_bytes, u64 parent,
1846                                  u64 root_objectid, u64 owner,
1847                                  u64 offset, int refs_to_add,
1848                                  struct btrfs_delayed_extent_op *extent_op)
1849 {
1850         struct btrfs_extent_inline_ref *iref;
1851         int ret;
1852
1853         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1854                                            num_bytes, parent, root_objectid,
1855                                            owner, offset, 1);
1856         if (ret == 0) {
1857                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1858                 update_inline_extent_backref(path, iref, refs_to_add,
1859                                              extent_op, NULL);
1860         } else if (ret == -ENOENT) {
1861                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1862                                             root_objectid, owner, offset,
1863                                             refs_to_add, extent_op);
1864                 ret = 0;
1865         }
1866         return ret;
1867 }
1868
1869 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1870                                  struct btrfs_path *path,
1871                                  u64 bytenr, u64 parent, u64 root_objectid,
1872                                  u64 owner, u64 offset, int refs_to_add)
1873 {
1874         int ret;
1875         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1876                 BUG_ON(refs_to_add != 1);
1877                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1878                                             root_objectid);
1879         } else {
1880                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1881                                              root_objectid, owner, offset,
1882                                              refs_to_add);
1883         }
1884         return ret;
1885 }
1886
1887 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1888                                  struct btrfs_path *path,
1889                                  struct btrfs_extent_inline_ref *iref,
1890                                  int refs_to_drop, int is_data, int *last_ref)
1891 {
1892         int ret = 0;
1893
1894         BUG_ON(!is_data && refs_to_drop != 1);
1895         if (iref) {
1896                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1897                                              last_ref);
1898         } else if (is_data) {
1899                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1900                                              last_ref);
1901         } else {
1902                 *last_ref = 1;
1903                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1904         }
1905         return ret;
1906 }
1907
1908 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1909                                u64 *discarded_bytes)
1910 {
1911         int j, ret = 0;
1912         u64 bytes_left, end;
1913         u64 aligned_start = ALIGN(start, 1 << 9);
1914
1915         if (WARN_ON(start != aligned_start)) {
1916                 len -= aligned_start - start;
1917                 len = round_down(len, 1 << 9);
1918                 start = aligned_start;
1919         }
1920
1921         *discarded_bytes = 0;
1922
1923         if (!len)
1924                 return 0;
1925
1926         end = start + len;
1927         bytes_left = len;
1928
1929         /* Skip any superblocks on this device. */
1930         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1931                 u64 sb_start = btrfs_sb_offset(j);
1932                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1933                 u64 size = sb_start - start;
1934
1935                 if (!in_range(sb_start, start, bytes_left) &&
1936                     !in_range(sb_end, start, bytes_left) &&
1937                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1938                         continue;
1939
1940                 /*
1941                  * Superblock spans beginning of range.  Adjust start and
1942                  * try again.
1943                  */
1944                 if (sb_start <= start) {
1945                         start += sb_end - start;
1946                         if (start > end) {
1947                                 bytes_left = 0;
1948                                 break;
1949                         }
1950                         bytes_left = end - start;
1951                         continue;
1952                 }
1953
1954                 if (size) {
1955                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1956                                                    GFP_NOFS, 0);
1957                         if (!ret)
1958                                 *discarded_bytes += size;
1959                         else if (ret != -EOPNOTSUPP)
1960                                 return ret;
1961                 }
1962
1963                 start = sb_end;
1964                 if (start > end) {
1965                         bytes_left = 0;
1966                         break;
1967                 }
1968                 bytes_left = end - start;
1969         }
1970
1971         if (bytes_left) {
1972                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1973                                            GFP_NOFS, 0);
1974                 if (!ret)
1975                         *discarded_bytes += bytes_left;
1976         }
1977         return ret;
1978 }
1979
1980 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1981                          u64 num_bytes, u64 *actual_bytes)
1982 {
1983         int ret;
1984         u64 discarded_bytes = 0;
1985         struct btrfs_bio *bbio = NULL;
1986
1987
1988         /*
1989          * Avoid races with device replace and make sure our bbio has devices
1990          * associated to its stripes that don't go away while we are discarding.
1991          */
1992         btrfs_bio_counter_inc_blocked(fs_info);
1993         /* Tell the block device(s) that the sectors can be discarded */
1994         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1995                               &bbio, 0);
1996         /* Error condition is -ENOMEM */
1997         if (!ret) {
1998                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1999                 int i;
2000
2001
2002                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2003                         u64 bytes;
2004                         struct request_queue *req_q;
2005
2006                         if (!stripe->dev->bdev) {
2007                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2008                                 continue;
2009                         }
2010                         req_q = bdev_get_queue(stripe->dev->bdev);
2011                         if (!blk_queue_discard(req_q))
2012                                 continue;
2013
2014                         ret = btrfs_issue_discard(stripe->dev->bdev,
2015                                                   stripe->physical,
2016                                                   stripe->length,
2017                                                   &bytes);
2018                         if (!ret)
2019                                 discarded_bytes += bytes;
2020                         else if (ret != -EOPNOTSUPP)
2021                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2022
2023                         /*
2024                          * Just in case we get back EOPNOTSUPP for some reason,
2025                          * just ignore the return value so we don't screw up
2026                          * people calling discard_extent.
2027                          */
2028                         ret = 0;
2029                 }
2030                 btrfs_put_bbio(bbio);
2031         }
2032         btrfs_bio_counter_dec(fs_info);
2033
2034         if (actual_bytes)
2035                 *actual_bytes = discarded_bytes;
2036
2037
2038         if (ret == -EOPNOTSUPP)
2039                 ret = 0;
2040         return ret;
2041 }
2042
2043 /* Can return -ENOMEM */
2044 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2045                          struct btrfs_root *root,
2046                          u64 bytenr, u64 num_bytes, u64 parent,
2047                          u64 root_objectid, u64 owner, u64 offset)
2048 {
2049         struct btrfs_fs_info *fs_info = root->fs_info;
2050         int old_ref_mod, new_ref_mod;
2051         int ret;
2052
2053         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2054                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2055
2056         btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2057                            owner, offset, BTRFS_ADD_DELAYED_REF);
2058
2059         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2060                 ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2061                                                  num_bytes, parent,
2062                                                  root_objectid, (int)owner,
2063                                                  BTRFS_ADD_DELAYED_REF, NULL,
2064                                                  &old_ref_mod, &new_ref_mod);
2065         } else {
2066                 ret = btrfs_add_delayed_data_ref(trans, bytenr,
2067                                                  num_bytes, parent,
2068                                                  root_objectid, owner, offset,
2069                                                  0, BTRFS_ADD_DELAYED_REF,
2070                                                  &old_ref_mod, &new_ref_mod);
2071         }
2072
2073         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2074                 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2075
2076                 add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2077         }
2078
2079         return ret;
2080 }
2081
2082 /*
2083  * __btrfs_inc_extent_ref - insert backreference for a given extent
2084  *
2085  * @trans:          Handle of transaction
2086  *
2087  * @node:           The delayed ref node used to get the bytenr/length for
2088  *                  extent whose references are incremented.
2089  *
2090  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2091  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2092  *                  bytenr of the parent block. Since new extents are always
2093  *                  created with indirect references, this will only be the case
2094  *                  when relocating a shared extent. In that case, root_objectid
2095  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2096  *                  be 0
2097  *
2098  * @root_objectid:  The id of the root where this modification has originated,
2099  *                  this can be either one of the well-known metadata trees or
2100  *                  the subvolume id which references this extent.
2101  *
2102  * @owner:          For data extents it is the inode number of the owning file.
2103  *                  For metadata extents this parameter holds the level in the
2104  *                  tree of the extent.
2105  *
2106  * @offset:         For metadata extents the offset is ignored and is currently
2107  *                  always passed as 0. For data extents it is the fileoffset
2108  *                  this extent belongs to.
2109  *
2110  * @refs_to_add     Number of references to add
2111  *
2112  * @extent_op       Pointer to a structure, holding information necessary when
2113  *                  updating a tree block's flags
2114  *
2115  */
2116 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2117                                   struct btrfs_delayed_ref_node *node,
2118                                   u64 parent, u64 root_objectid,
2119                                   u64 owner, u64 offset, int refs_to_add,
2120                                   struct btrfs_delayed_extent_op *extent_op)
2121 {
2122         struct btrfs_path *path;
2123         struct extent_buffer *leaf;
2124         struct btrfs_extent_item *item;
2125         struct btrfs_key key;
2126         u64 bytenr = node->bytenr;
2127         u64 num_bytes = node->num_bytes;
2128         u64 refs;
2129         int ret;
2130
2131         path = btrfs_alloc_path();
2132         if (!path)
2133                 return -ENOMEM;
2134
2135         path->reada = READA_FORWARD;
2136         path->leave_spinning = 1;
2137         /* this will setup the path even if it fails to insert the back ref */
2138         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2139                                            parent, root_objectid, owner,
2140                                            offset, refs_to_add, extent_op);
2141         if ((ret < 0 && ret != -EAGAIN) || !ret)
2142                 goto out;
2143
2144         /*
2145          * Ok we had -EAGAIN which means we didn't have space to insert and
2146          * inline extent ref, so just update the reference count and add a
2147          * normal backref.
2148          */
2149         leaf = path->nodes[0];
2150         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2151         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2152         refs = btrfs_extent_refs(leaf, item);
2153         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2154         if (extent_op)
2155                 __run_delayed_extent_op(extent_op, leaf, item);
2156
2157         btrfs_mark_buffer_dirty(leaf);
2158         btrfs_release_path(path);
2159
2160         path->reada = READA_FORWARD;
2161         path->leave_spinning = 1;
2162         /* now insert the actual backref */
2163         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2164                                     owner, offset, refs_to_add);
2165         if (ret)
2166                 btrfs_abort_transaction(trans, ret);
2167 out:
2168         btrfs_free_path(path);
2169         return ret;
2170 }
2171
2172 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2173                                 struct btrfs_delayed_ref_node *node,
2174                                 struct btrfs_delayed_extent_op *extent_op,
2175                                 int insert_reserved)
2176 {
2177         int ret = 0;
2178         struct btrfs_delayed_data_ref *ref;
2179         struct btrfs_key ins;
2180         u64 parent = 0;
2181         u64 ref_root = 0;
2182         u64 flags = 0;
2183
2184         ins.objectid = node->bytenr;
2185         ins.offset = node->num_bytes;
2186         ins.type = BTRFS_EXTENT_ITEM_KEY;
2187
2188         ref = btrfs_delayed_node_to_data_ref(node);
2189         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2190
2191         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2192                 parent = ref->parent;
2193         ref_root = ref->root;
2194
2195         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2196                 if (extent_op)
2197                         flags |= extent_op->flags_to_set;
2198                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2199                                                  flags, ref->objectid,
2200                                                  ref->offset, &ins,
2201                                                  node->ref_mod);
2202         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2203                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2204                                              ref->objectid, ref->offset,
2205                                              node->ref_mod, extent_op);
2206         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2207                 ret = __btrfs_free_extent(trans, node, parent,
2208                                           ref_root, ref->objectid,
2209                                           ref->offset, node->ref_mod,
2210                                           extent_op);
2211         } else {
2212                 BUG();
2213         }
2214         return ret;
2215 }
2216
2217 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2218                                     struct extent_buffer *leaf,
2219                                     struct btrfs_extent_item *ei)
2220 {
2221         u64 flags = btrfs_extent_flags(leaf, ei);
2222         if (extent_op->update_flags) {
2223                 flags |= extent_op->flags_to_set;
2224                 btrfs_set_extent_flags(leaf, ei, flags);
2225         }
2226
2227         if (extent_op->update_key) {
2228                 struct btrfs_tree_block_info *bi;
2229                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2230                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2231                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2232         }
2233 }
2234
2235 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2236                                  struct btrfs_delayed_ref_head *head,
2237                                  struct btrfs_delayed_extent_op *extent_op)
2238 {
2239         struct btrfs_fs_info *fs_info = trans->fs_info;
2240         struct btrfs_key key;
2241         struct btrfs_path *path;
2242         struct btrfs_extent_item *ei;
2243         struct extent_buffer *leaf;
2244         u32 item_size;
2245         int ret;
2246         int err = 0;
2247         int metadata = !extent_op->is_data;
2248
2249         if (trans->aborted)
2250                 return 0;
2251
2252         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2253                 metadata = 0;
2254
2255         path = btrfs_alloc_path();
2256         if (!path)
2257                 return -ENOMEM;
2258
2259         key.objectid = head->bytenr;
2260
2261         if (metadata) {
2262                 key.type = BTRFS_METADATA_ITEM_KEY;
2263                 key.offset = extent_op->level;
2264         } else {
2265                 key.type = BTRFS_EXTENT_ITEM_KEY;
2266                 key.offset = head->num_bytes;
2267         }
2268
2269 again:
2270         path->reada = READA_FORWARD;
2271         path->leave_spinning = 1;
2272         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2273         if (ret < 0) {
2274                 err = ret;
2275                 goto out;
2276         }
2277         if (ret > 0) {
2278                 if (metadata) {
2279                         if (path->slots[0] > 0) {
2280                                 path->slots[0]--;
2281                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2282                                                       path->slots[0]);
2283                                 if (key.objectid == head->bytenr &&
2284                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2285                                     key.offset == head->num_bytes)
2286                                         ret = 0;
2287                         }
2288                         if (ret > 0) {
2289                                 btrfs_release_path(path);
2290                                 metadata = 0;
2291
2292                                 key.objectid = head->bytenr;
2293                                 key.offset = head->num_bytes;
2294                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2295                                 goto again;
2296                         }
2297                 } else {
2298                         err = -EIO;
2299                         goto out;
2300                 }
2301         }
2302
2303         leaf = path->nodes[0];
2304         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2305
2306         if (unlikely(item_size < sizeof(*ei))) {
2307                 err = -EINVAL;
2308                 btrfs_print_v0_err(fs_info);
2309                 btrfs_abort_transaction(trans, err);
2310                 goto out;
2311         }
2312
2313         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2314         __run_delayed_extent_op(extent_op, leaf, ei);
2315
2316         btrfs_mark_buffer_dirty(leaf);
2317 out:
2318         btrfs_free_path(path);
2319         return err;
2320 }
2321
2322 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2323                                 struct btrfs_delayed_ref_node *node,
2324                                 struct btrfs_delayed_extent_op *extent_op,
2325                                 int insert_reserved)
2326 {
2327         int ret = 0;
2328         struct btrfs_delayed_tree_ref *ref;
2329         u64 parent = 0;
2330         u64 ref_root = 0;
2331
2332         ref = btrfs_delayed_node_to_tree_ref(node);
2333         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2334
2335         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2336                 parent = ref->parent;
2337         ref_root = ref->root;
2338
2339         if (node->ref_mod != 1) {
2340                 btrfs_err(trans->fs_info,
2341         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2342                           node->bytenr, node->ref_mod, node->action, ref_root,
2343                           parent);
2344                 return -EIO;
2345         }
2346         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2347                 BUG_ON(!extent_op || !extent_op->update_flags);
2348                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2349         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2350                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2351                                              ref->level, 0, 1, extent_op);
2352         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2353                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2354                                           ref->level, 0, 1, extent_op);
2355         } else {
2356                 BUG();
2357         }
2358         return ret;
2359 }
2360
2361 /* helper function to actually process a single delayed ref entry */
2362 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2363                                struct btrfs_delayed_ref_node *node,
2364                                struct btrfs_delayed_extent_op *extent_op,
2365                                int insert_reserved)
2366 {
2367         int ret = 0;
2368
2369         if (trans->aborted) {
2370                 if (insert_reserved)
2371                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2372                                          node->num_bytes, 1);
2373                 return 0;
2374         }
2375
2376         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2377             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2378                 ret = run_delayed_tree_ref(trans, node, extent_op,
2379                                            insert_reserved);
2380         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2381                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2382                 ret = run_delayed_data_ref(trans, node, extent_op,
2383                                            insert_reserved);
2384         else
2385                 BUG();
2386         if (ret && insert_reserved)
2387                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2388                                  node->num_bytes, 1);
2389         return ret;
2390 }
2391
2392 static inline struct btrfs_delayed_ref_node *
2393 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2394 {
2395         struct btrfs_delayed_ref_node *ref;
2396
2397         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2398                 return NULL;
2399
2400         /*
2401          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2402          * This is to prevent a ref count from going down to zero, which deletes
2403          * the extent item from the extent tree, when there still are references
2404          * to add, which would fail because they would not find the extent item.
2405          */
2406         if (!list_empty(&head->ref_add_list))
2407                 return list_first_entry(&head->ref_add_list,
2408                                 struct btrfs_delayed_ref_node, add_list);
2409
2410         ref = rb_entry(rb_first_cached(&head->ref_tree),
2411                        struct btrfs_delayed_ref_node, ref_node);
2412         ASSERT(list_empty(&ref->add_list));
2413         return ref;
2414 }
2415
2416 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2417                                       struct btrfs_delayed_ref_head *head)
2418 {
2419         spin_lock(&delayed_refs->lock);
2420         head->processing = 0;
2421         delayed_refs->num_heads_ready++;
2422         spin_unlock(&delayed_refs->lock);
2423         btrfs_delayed_ref_unlock(head);
2424 }
2425
2426 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2427                                 struct btrfs_delayed_ref_head *head)
2428 {
2429         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2430
2431         if (!extent_op)
2432                 return NULL;
2433
2434         if (head->must_insert_reserved) {
2435                 head->extent_op = NULL;
2436                 btrfs_free_delayed_extent_op(extent_op);
2437                 return NULL;
2438         }
2439         return extent_op;
2440 }
2441
2442 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2443                                      struct btrfs_delayed_ref_head *head)
2444 {
2445         struct btrfs_delayed_extent_op *extent_op;
2446         int ret;
2447
2448         extent_op = cleanup_extent_op(head);
2449         if (!extent_op)
2450                 return 0;
2451         head->extent_op = NULL;
2452         spin_unlock(&head->lock);
2453         ret = run_delayed_extent_op(trans, head, extent_op);
2454         btrfs_free_delayed_extent_op(extent_op);
2455         return ret ? ret : 1;
2456 }
2457
2458 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2459                                   struct btrfs_delayed_ref_root *delayed_refs,
2460                                   struct btrfs_delayed_ref_head *head)
2461 {
2462         int nr_items = 1;       /* Dropping this ref head update. */
2463
2464         if (head->total_ref_mod < 0) {
2465                 struct btrfs_space_info *space_info;
2466                 u64 flags;
2467
2468                 if (head->is_data)
2469                         flags = BTRFS_BLOCK_GROUP_DATA;
2470                 else if (head->is_system)
2471                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2472                 else
2473                         flags = BTRFS_BLOCK_GROUP_METADATA;
2474                 space_info = __find_space_info(fs_info, flags);
2475                 ASSERT(space_info);
2476                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2477                                    -head->num_bytes,
2478                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2479
2480                 /*
2481                  * We had csum deletions accounted for in our delayed refs rsv,
2482                  * we need to drop the csum leaves for this update from our
2483                  * delayed_refs_rsv.
2484                  */
2485                 if (head->is_data) {
2486                         spin_lock(&delayed_refs->lock);
2487                         delayed_refs->pending_csums -= head->num_bytes;
2488                         spin_unlock(&delayed_refs->lock);
2489                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2490                                 head->num_bytes);
2491                 }
2492         }
2493
2494         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2495 }
2496
2497 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2498                             struct btrfs_delayed_ref_head *head)
2499 {
2500
2501         struct btrfs_fs_info *fs_info = trans->fs_info;
2502         struct btrfs_delayed_ref_root *delayed_refs;
2503         int ret;
2504
2505         delayed_refs = &trans->transaction->delayed_refs;
2506
2507         ret = run_and_cleanup_extent_op(trans, head);
2508         if (ret < 0) {
2509                 unselect_delayed_ref_head(delayed_refs, head);
2510                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2511                 return ret;
2512         } else if (ret) {
2513                 return ret;
2514         }
2515
2516         /*
2517          * Need to drop our head ref lock and re-acquire the delayed ref lock
2518          * and then re-check to make sure nobody got added.
2519          */
2520         spin_unlock(&head->lock);
2521         spin_lock(&delayed_refs->lock);
2522         spin_lock(&head->lock);
2523         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2524                 spin_unlock(&head->lock);
2525                 spin_unlock(&delayed_refs->lock);
2526                 return 1;
2527         }
2528         btrfs_delete_ref_head(delayed_refs, head);
2529         spin_unlock(&head->lock);
2530         spin_unlock(&delayed_refs->lock);
2531
2532         if (head->must_insert_reserved) {
2533                 btrfs_pin_extent(fs_info, head->bytenr,
2534                                  head->num_bytes, 1);
2535                 if (head->is_data) {
2536                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2537                                               head->num_bytes);
2538                 }
2539         }
2540
2541         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2542
2543         trace_run_delayed_ref_head(fs_info, head, 0);
2544         btrfs_delayed_ref_unlock(head);
2545         btrfs_put_delayed_ref_head(head);
2546         return 0;
2547 }
2548
2549 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2550                                         struct btrfs_trans_handle *trans)
2551 {
2552         struct btrfs_delayed_ref_root *delayed_refs =
2553                 &trans->transaction->delayed_refs;
2554         struct btrfs_delayed_ref_head *head = NULL;
2555         int ret;
2556
2557         spin_lock(&delayed_refs->lock);
2558         head = btrfs_select_ref_head(delayed_refs);
2559         if (!head) {
2560                 spin_unlock(&delayed_refs->lock);
2561                 return head;
2562         }
2563
2564         /*
2565          * Grab the lock that says we are going to process all the refs for
2566          * this head
2567          */
2568         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2569         spin_unlock(&delayed_refs->lock);
2570
2571         /*
2572          * We may have dropped the spin lock to get the head mutex lock, and
2573          * that might have given someone else time to free the head.  If that's
2574          * true, it has been removed from our list and we can move on.
2575          */
2576         if (ret == -EAGAIN)
2577                 head = ERR_PTR(-EAGAIN);
2578
2579         return head;
2580 }
2581
2582 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2583                                     struct btrfs_delayed_ref_head *locked_ref,
2584                                     unsigned long *run_refs)
2585 {
2586         struct btrfs_fs_info *fs_info = trans->fs_info;
2587         struct btrfs_delayed_ref_root *delayed_refs;
2588         struct btrfs_delayed_extent_op *extent_op;
2589         struct btrfs_delayed_ref_node *ref;
2590         int must_insert_reserved = 0;
2591         int ret;
2592
2593         delayed_refs = &trans->transaction->delayed_refs;
2594
2595         lockdep_assert_held(&locked_ref->mutex);
2596         lockdep_assert_held(&locked_ref->lock);
2597
2598         while ((ref = select_delayed_ref(locked_ref))) {
2599                 if (ref->seq &&
2600                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2601                         spin_unlock(&locked_ref->lock);
2602                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2603                         return -EAGAIN;
2604                 }
2605
2606                 (*run_refs)++;
2607                 ref->in_tree = 0;
2608                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2609                 RB_CLEAR_NODE(&ref->ref_node);
2610                 if (!list_empty(&ref->add_list))
2611                         list_del(&ref->add_list);
2612                 /*
2613                  * When we play the delayed ref, also correct the ref_mod on
2614                  * head
2615                  */
2616                 switch (ref->action) {
2617                 case BTRFS_ADD_DELAYED_REF:
2618                 case BTRFS_ADD_DELAYED_EXTENT:
2619                         locked_ref->ref_mod -= ref->ref_mod;
2620                         break;
2621                 case BTRFS_DROP_DELAYED_REF:
2622                         locked_ref->ref_mod += ref->ref_mod;
2623                         break;
2624                 default:
2625                         WARN_ON(1);
2626                 }
2627                 atomic_dec(&delayed_refs->num_entries);
2628
2629                 /*
2630                  * Record the must_insert_reserved flag before we drop the
2631                  * spin lock.
2632                  */
2633                 must_insert_reserved = locked_ref->must_insert_reserved;
2634                 locked_ref->must_insert_reserved = 0;
2635
2636                 extent_op = locked_ref->extent_op;
2637                 locked_ref->extent_op = NULL;
2638                 spin_unlock(&locked_ref->lock);
2639
2640                 ret = run_one_delayed_ref(trans, ref, extent_op,
2641                                           must_insert_reserved);
2642
2643                 btrfs_free_delayed_extent_op(extent_op);
2644                 if (ret) {
2645                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2646                         btrfs_put_delayed_ref(ref);
2647                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2648                                     ret);
2649                         return ret;
2650                 }
2651
2652                 btrfs_put_delayed_ref(ref);
2653                 cond_resched();
2654
2655                 spin_lock(&locked_ref->lock);
2656                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2657         }
2658
2659         return 0;
2660 }
2661
2662 /*
2663  * Returns 0 on success or if called with an already aborted transaction.
2664  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2665  */
2666 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2667                                              unsigned long nr)
2668 {
2669         struct btrfs_fs_info *fs_info = trans->fs_info;
2670         struct btrfs_delayed_ref_root *delayed_refs;
2671         struct btrfs_delayed_ref_head *locked_ref = NULL;
2672         ktime_t start = ktime_get();
2673         int ret;
2674         unsigned long count = 0;
2675         unsigned long actual_count = 0;
2676
2677         delayed_refs = &trans->transaction->delayed_refs;
2678         do {
2679                 if (!locked_ref) {
2680                         locked_ref = btrfs_obtain_ref_head(trans);
2681                         if (IS_ERR_OR_NULL(locked_ref)) {
2682                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2683                                         continue;
2684                                 } else {
2685                                         break;
2686                                 }
2687                         }
2688                         count++;
2689                 }
2690                 /*
2691                  * We need to try and merge add/drops of the same ref since we
2692                  * can run into issues with relocate dropping the implicit ref
2693                  * and then it being added back again before the drop can
2694                  * finish.  If we merged anything we need to re-loop so we can
2695                  * get a good ref.
2696                  * Or we can get node references of the same type that weren't
2697                  * merged when created due to bumps in the tree mod seq, and
2698                  * we need to merge them to prevent adding an inline extent
2699                  * backref before dropping it (triggering a BUG_ON at
2700                  * insert_inline_extent_backref()).
2701                  */
2702                 spin_lock(&locked_ref->lock);
2703                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2704
2705                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2706                                                       &actual_count);
2707                 if (ret < 0 && ret != -EAGAIN) {
2708                         /*
2709                          * Error, btrfs_run_delayed_refs_for_head already
2710                          * unlocked everything so just bail out
2711                          */
2712                         return ret;
2713                 } else if (!ret) {
2714                         /*
2715                          * Success, perform the usual cleanup of a processed
2716                          * head
2717                          */
2718                         ret = cleanup_ref_head(trans, locked_ref);
2719                         if (ret > 0 ) {
2720                                 /* We dropped our lock, we need to loop. */
2721                                 ret = 0;
2722                                 continue;
2723                         } else if (ret) {
2724                                 return ret;
2725                         }
2726                 }
2727
2728                 /*
2729                  * Either success case or btrfs_run_delayed_refs_for_head
2730                  * returned -EAGAIN, meaning we need to select another head
2731                  */
2732
2733                 locked_ref = NULL;
2734                 cond_resched();
2735         } while ((nr != -1 && count < nr) || locked_ref);
2736
2737         /*
2738          * We don't want to include ref heads since we can have empty ref heads
2739          * and those will drastically skew our runtime down since we just do
2740          * accounting, no actual extent tree updates.
2741          */
2742         if (actual_count > 0) {
2743                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2744                 u64 avg;
2745
2746                 /*
2747                  * We weigh the current average higher than our current runtime
2748                  * to avoid large swings in the average.
2749                  */
2750                 spin_lock(&delayed_refs->lock);
2751                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2752                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2753                 spin_unlock(&delayed_refs->lock);
2754         }
2755         return 0;
2756 }
2757
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Walks from the root of @root towards a leaf, alternating between the left
 * and the right child, and returns the bytenr of the last entry visited.
 * Returns 0 if the tree is empty.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle = 0;	/* defined result for an empty tree */

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2800
2801 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2802 {
2803         u64 num_bytes;
2804
2805         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2806                              sizeof(struct btrfs_extent_inline_ref));
2807         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2808                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2809
2810         /*
2811          * We don't ever fill up leaves all the way so multiply by 2 just to be
2812          * closer to what we're really going to want to use.
2813          */
2814         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2815 }
2816
2817 /*
2818  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2819  * would require to store the csums for that many bytes.
2820  */
2821 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2822 {
2823         u64 csum_size;
2824         u64 num_csums_per_leaf;
2825         u64 num_csums;
2826
2827         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2828         num_csums_per_leaf = div64_u64(csum_size,
2829                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2830         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2831         num_csums += num_csums_per_leaf - 1;
2832         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2833         return num_csums;
2834 }
2835
2836 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2837 {
2838         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2839         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2840         bool ret = false;
2841         u64 reserved;
2842
2843         spin_lock(&global_rsv->lock);
2844         reserved = global_rsv->reserved;
2845         spin_unlock(&global_rsv->lock);
2846
2847         /*
2848          * Since the global reserve is just kind of magic we don't really want
2849          * to rely on it to save our bacon, so if our size is more than the
2850          * delayed_refs_rsv and the global rsv then it's time to think about
2851          * bailing.
2852          */
2853         spin_lock(&delayed_refs_rsv->lock);
2854         reserved += delayed_refs_rsv->reserved;
2855         if (delayed_refs_rsv->size >= reserved)
2856                 ret = true;
2857         spin_unlock(&delayed_refs_rsv->lock);
2858         return ret;
2859 }
2860
2861 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2862 {
2863         u64 num_entries =
2864                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2865         u64 avg_runtime;
2866         u64 val;
2867
2868         smp_mb();
2869         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2870         val = num_entries * avg_runtime;
2871         if (val >= NSEC_PER_SEC)
2872                 return 1;
2873         if (val >= NSEC_PER_SEC / 2)
2874                 return 2;
2875
2876         return btrfs_check_space_for_delayed_refs(trans->fs_info);
2877 }
2878
2879 struct async_delayed_refs {
2880         struct btrfs_root *root;
2881         u64 transid;
2882         int count;
2883         int error;
2884         int sync;
2885         struct completion wait;
2886         struct btrfs_work work;
2887 };
2888
2889 static inline struct async_delayed_refs *
2890 to_async_delayed_refs(struct btrfs_work *work)
2891 {
2892         return container_of(work, struct async_delayed_refs, work);
2893 }
2894
2895 static void delayed_ref_async_start(struct btrfs_work *work)
2896 {
2897         struct async_delayed_refs *async = to_async_delayed_refs(work);
2898         struct btrfs_trans_handle *trans;
2899         struct btrfs_fs_info *fs_info = async->root->fs_info;
2900         int ret;
2901
2902         /* if the commit is already started, we don't need to wait here */
2903         if (btrfs_transaction_blocked(fs_info))
2904                 goto done;
2905
2906         trans = btrfs_join_transaction(async->root);
2907         if (IS_ERR(trans)) {
2908                 async->error = PTR_ERR(trans);
2909                 goto done;
2910         }
2911
2912         /* Don't bother flushing if we got into a different transaction */
2913         if (trans->transid > async->transid)
2914                 goto end;
2915
2916         ret = btrfs_run_delayed_refs(trans, async->count);
2917         if (ret)
2918                 async->error = ret;
2919 end:
2920         ret = btrfs_end_transaction(trans);
2921         if (ret && !async->error)
2922                 async->error = ret;
2923 done:
2924         if (async->sync)
2925                 complete(&async->wait);
2926         else
2927                 kfree(async);
2928 }
2929
2930 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2931                                  unsigned long count, u64 transid, int wait)
2932 {
2933         struct async_delayed_refs *async;
2934         int ret;
2935
2936         async = kmalloc(sizeof(*async), GFP_NOFS);
2937         if (!async)
2938                 return -ENOMEM;
2939
2940         async->root = fs_info->tree_root;
2941         async->count = count;
2942         async->error = 0;
2943         async->transid = transid;
2944         if (wait)
2945                 async->sync = 1;
2946         else
2947                 async->sync = 0;
2948         init_completion(&async->wait);
2949
2950         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2951                         delayed_ref_async_start, NULL, NULL);
2952
2953         btrfs_queue_work(fs_info->extent_workers, &async->work);
2954
2955         if (wait) {
2956                 wait_for_completion(&async->wait);
2957                 ret = async->error;
2958                 kfree(async);
2959                 return ret;
2960         }
2961         return 0;
2962 }
2963
2964 /*
2965  * this starts processing the delayed reference count updates and
2966  * extent insertions we have queued up so far.  count can be
2967  * 0, which means to process everything in the tree at the start
2968  * of the run (but not newly added entries), or it can be some target
2969  * number you'd like to process.
2970  *
2971  * Returns 0 on success or if called with an aborted transaction
2972  * Returns <0 on error and aborts the transaction
2973  */
2974 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2975                            unsigned long count)
2976 {
2977         struct btrfs_fs_info *fs_info = trans->fs_info;
2978         struct rb_node *node;
2979         struct btrfs_delayed_ref_root *delayed_refs;
2980         struct btrfs_delayed_ref_head *head;
2981         int ret;
2982         int run_all = count == (unsigned long)-1;
2983
2984         /* We'll clean this up in btrfs_cleanup_transaction */
2985         if (trans->aborted)
2986                 return 0;
2987
2988         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2989                 return 0;
2990
2991         delayed_refs = &trans->transaction->delayed_refs;
2992         if (count == 0)
2993                 count = atomic_read(&delayed_refs->num_entries) * 2;
2994
2995 again:
2996 #ifdef SCRAMBLE_DELAYED_REFS
2997         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2998 #endif
2999         ret = __btrfs_run_delayed_refs(trans, count);
3000         if (ret < 0) {
3001                 btrfs_abort_transaction(trans, ret);
3002                 return ret;
3003         }
3004
3005         if (run_all) {
3006                 btrfs_create_pending_block_groups(trans);
3007
3008                 spin_lock(&delayed_refs->lock);
3009                 node = rb_first_cached(&delayed_refs->href_root);
3010                 if (!node) {
3011                         spin_unlock(&delayed_refs->lock);
3012                         goto out;
3013                 }
3014                 head = rb_entry(node, struct btrfs_delayed_ref_head,
3015                                 href_node);
3016                 refcount_inc(&head->refs);
3017                 spin_unlock(&delayed_refs->lock);
3018
3019                 /* Mutex was contended, block until it's released and retry. */
3020                 mutex_lock(&head->mutex);
3021                 mutex_unlock(&head->mutex);
3022
3023                 btrfs_put_delayed_ref_head(head);
3024                 cond_resched();
3025                 goto again;
3026         }
3027 out:
3028         return 0;
3029 }
3030
3031 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3032                                 struct btrfs_fs_info *fs_info,
3033                                 u64 bytenr, u64 num_bytes, u64 flags,
3034                                 int level, int is_data)
3035 {
3036         struct btrfs_delayed_extent_op *extent_op;
3037         int ret;
3038
3039         extent_op = btrfs_alloc_delayed_extent_op();
3040         if (!extent_op)
3041                 return -ENOMEM;
3042
3043         extent_op->flags_to_set = flags;
3044         extent_op->update_flags = true;
3045         extent_op->update_key = false;
3046         extent_op->is_data = is_data ? true : false;
3047         extent_op->level = level;
3048
3049         ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3050                                           num_bytes, extent_op);
3051         if (ret)
3052                 btrfs_free_delayed_extent_op(extent_op);
3053         return ret;
3054 }
3055
3056 static noinline int check_delayed_ref(struct btrfs_root *root,
3057                                       struct btrfs_path *path,
3058                                       u64 objectid, u64 offset, u64 bytenr)
3059 {
3060         struct btrfs_delayed_ref_head *head;
3061         struct btrfs_delayed_ref_node *ref;
3062         struct btrfs_delayed_data_ref *data_ref;
3063         struct btrfs_delayed_ref_root *delayed_refs;
3064         struct btrfs_transaction *cur_trans;
3065         struct rb_node *node;
3066         int ret = 0;
3067
3068         spin_lock(&root->fs_info->trans_lock);
3069         cur_trans = root->fs_info->running_transaction;
3070         if (cur_trans)
3071                 refcount_inc(&cur_trans->use_count);
3072         spin_unlock(&root->fs_info->trans_lock);
3073         if (!cur_trans)
3074                 return 0;
3075
3076         delayed_refs = &cur_trans->delayed_refs;
3077         spin_lock(&delayed_refs->lock);
3078         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3079         if (!head) {
3080                 spin_unlock(&delayed_refs->lock);
3081                 btrfs_put_transaction(cur_trans);
3082                 return 0;
3083         }
3084
3085         if (!mutex_trylock(&head->mutex)) {
3086                 refcount_inc(&head->refs);
3087                 spin_unlock(&delayed_refs->lock);
3088
3089                 btrfs_release_path(path);
3090
3091                 /*
3092                  * Mutex was contended, block until it's released and let
3093                  * caller try again
3094                  */
3095                 mutex_lock(&head->mutex);
3096                 mutex_unlock(&head->mutex);
3097                 btrfs_put_delayed_ref_head(head);
3098                 btrfs_put_transaction(cur_trans);
3099                 return -EAGAIN;
3100         }
3101         spin_unlock(&delayed_refs->lock);
3102
3103         spin_lock(&head->lock);
3104         /*
3105          * XXX: We should replace this with a proper search function in the
3106          * future.
3107          */
3108         for (node = rb_first_cached(&head->ref_tree); node;
3109              node = rb_next(node)) {
3110                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3111                 /* If it's a shared ref we know a cross reference exists */
3112                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3113                         ret = 1;
3114                         break;
3115                 }
3116
3117                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3118
3119                 /*
3120                  * If our ref doesn't match the one we're currently looking at
3121                  * then we have a cross reference.
3122                  */
3123                 if (data_ref->root != root->root_key.objectid ||
3124                     data_ref->objectid != objectid ||
3125                     data_ref->offset != offset) {
3126                         ret = 1;
3127                         break;
3128                 }
3129         }
3130         spin_unlock(&head->lock);
3131         mutex_unlock(&head->mutex);
3132         btrfs_put_transaction(cur_trans);
3133         return ret;
3134 }
3135
3136 static noinline int check_committed_ref(struct btrfs_root *root,
3137                                         struct btrfs_path *path,
3138                                         u64 objectid, u64 offset, u64 bytenr)
3139 {
3140         struct btrfs_fs_info *fs_info = root->fs_info;
3141         struct btrfs_root *extent_root = fs_info->extent_root;
3142         struct extent_buffer *leaf;
3143         struct btrfs_extent_data_ref *ref;
3144         struct btrfs_extent_inline_ref *iref;
3145         struct btrfs_extent_item *ei;
3146         struct btrfs_key key;
3147         u32 item_size;
3148         int type;
3149         int ret;
3150
3151         key.objectid = bytenr;
3152         key.offset = (u64)-1;
3153         key.type = BTRFS_EXTENT_ITEM_KEY;
3154
3155         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3156         if (ret < 0)
3157                 goto out;
3158         BUG_ON(ret == 0); /* Corruption */
3159
3160         ret = -ENOENT;
3161         if (path->slots[0] == 0)
3162                 goto out;
3163
3164         path->slots[0]--;
3165         leaf = path->nodes[0];
3166         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3167
3168         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3169                 goto out;
3170
3171         ret = 1;
3172         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3173         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3174
3175         if (item_size != sizeof(*ei) +
3176             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3177                 goto out;
3178
3179         if (btrfs_extent_generation(leaf, ei) <=
3180             btrfs_root_last_snapshot(&root->root_item))
3181                 goto out;
3182
3183         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3184
3185         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3186         if (type != BTRFS_EXTENT_DATA_REF_KEY)
3187                 goto out;
3188
3189         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3190         if (btrfs_extent_refs(leaf, ei) !=
3191             btrfs_extent_data_ref_count(leaf, ref) ||
3192             btrfs_extent_data_ref_root(leaf, ref) !=
3193             root->root_key.objectid ||
3194             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3195             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3196                 goto out;
3197
3198         ret = 0;
3199 out:
3200         return ret;
3201 }
3202
3203 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3204                           u64 bytenr)
3205 {
3206         struct btrfs_path *path;
3207         int ret;
3208
3209         path = btrfs_alloc_path();
3210         if (!path)
3211                 return -ENOMEM;
3212
3213         do {
3214                 ret = check_committed_ref(root, path, objectid,
3215                                           offset, bytenr);
3216                 if (ret && ret != -ENOENT)
3217                         goto out;
3218
3219                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3220         } while (ret == -EAGAIN);
3221
3222 out:
3223         btrfs_free_path(path);
3224         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3225                 WARN_ON(ret > 0);
3226         return ret;
3227 }
3228
3229 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3230                            struct btrfs_root *root,
3231                            struct extent_buffer *buf,
3232                            int full_backref, int inc)
3233 {
3234         struct btrfs_fs_info *fs_info = root->fs_info;
3235         u64 bytenr;
3236         u64 num_bytes;
3237         u64 parent;
3238         u64 ref_root;
3239         u32 nritems;
3240         struct btrfs_key key;
3241         struct btrfs_file_extent_item *fi;
3242         int i;
3243         int level;
3244         int ret = 0;
3245         int (*process_func)(struct btrfs_trans_handle *,
3246                             struct btrfs_root *,
3247                             u64, u64, u64, u64, u64, u64);
3248
3249
3250         if (btrfs_is_testing(fs_info))
3251                 return 0;
3252
3253         ref_root = btrfs_header_owner(buf);
3254         nritems = btrfs_header_nritems(buf);
3255         level = btrfs_header_level(buf);
3256
3257         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3258                 return 0;
3259
3260         if (inc)
3261                 process_func = btrfs_inc_extent_ref;
3262         else
3263                 process_func = btrfs_free_extent;
3264
3265         if (full_backref)
3266                 parent = buf->start;
3267         else
3268                 parent = 0;
3269
3270         for (i = 0; i < nritems; i++) {
3271                 if (level == 0) {
3272                         btrfs_item_key_to_cpu(buf, &key, i);
3273                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3274                                 continue;
3275                         fi = btrfs_item_ptr(buf, i,
3276                                             struct btrfs_file_extent_item);
3277                         if (btrfs_file_extent_type(buf, fi) ==
3278                             BTRFS_FILE_EXTENT_INLINE)
3279                                 continue;
3280                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3281                         if (bytenr == 0)
3282                                 continue;
3283
3284                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3285                         key.offset -= btrfs_file_extent_offset(buf, fi);
3286                         ret = process_func(trans, root, bytenr, num_bytes,
3287                                            parent, ref_root, key.objectid,
3288                                            key.offset);
3289                         if (ret)
3290                                 goto fail;
3291                 } else {
3292                         bytenr = btrfs_node_blockptr(buf, i);
3293                         num_bytes = fs_info->nodesize;
3294                         ret = process_func(trans, root, bytenr, num_bytes,
3295                                            parent, ref_root, level - 1, 0);
3296                         if (ret)
3297                                 goto fail;
3298                 }
3299         }
3300         return 0;
3301 fail:
3302         return ret;
3303 }
3304
/*
 * Add one reference on every extent or child block @buf points to.
 * Thin wrapper around __btrfs_mod_ref() with inc == 1; returns its result.
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 1;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3310
/*
 * Drop one reference on every extent or child block @buf points to.
 * Thin wrapper around __btrfs_mod_ref() with inc == 0; returns its result.
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 0;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3316
3317 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3318                                  struct btrfs_path *path,
3319                                  struct btrfs_block_group_cache *cache)
3320 {
3321         struct btrfs_fs_info *fs_info = trans->fs_info;
3322         int ret;
3323         struct btrfs_root *extent_root = fs_info->extent_root;
3324         unsigned long bi;
3325         struct extent_buffer *leaf;
3326
3327         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3328         if (ret) {
3329                 if (ret > 0)
3330                         ret = -ENOENT;
3331                 goto fail;
3332         }
3333
3334         leaf = path->nodes[0];
3335         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3336         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3337         btrfs_mark_buffer_dirty(leaf);
3338 fail:
3339         btrfs_release_path(path);
3340         return ret;
3341
3342 }
3343
3344 static struct btrfs_block_group_cache *
3345 next_block_group(struct btrfs_fs_info *fs_info,
3346                  struct btrfs_block_group_cache *cache)
3347 {
3348         struct rb_node *node;
3349
3350         spin_lock(&fs_info->block_group_cache_lock);
3351
3352         /* If our block group was removed, we need a full search. */
3353         if (RB_EMPTY_NODE(&cache->cache_node)) {
3354                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3355
3356                 spin_unlock(&fs_info->block_group_cache_lock);
3357                 btrfs_put_block_group(cache);
3358                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3359         }
3360         node = rb_next(&cache->cache_node);
3361         btrfs_put_block_group(cache);
3362         if (node) {
3363                 cache = rb_entry(node, struct btrfs_block_group_cache,
3364                                  cache_node);
3365                 btrfs_get_block_group(cache);
3366         } else
3367                 cache = NULL;
3368         spin_unlock(&fs_info->block_group_cache_lock);
3369         return cache;
3370 }
3371
3372 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3373                             struct btrfs_trans_handle *trans,
3374                             struct btrfs_path *path)
3375 {
3376         struct btrfs_fs_info *fs_info = block_group->fs_info;
3377         struct btrfs_root *root = fs_info->tree_root;
3378         struct inode *inode = NULL;
3379         struct extent_changeset *data_reserved = NULL;
3380         u64 alloc_hint = 0;
3381         int dcs = BTRFS_DC_ERROR;
3382         u64 num_pages = 0;
3383         int retries = 0;
3384         int ret = 0;
3385
3386         /*
3387          * If this block group is smaller than 100 megs don't bother caching the
3388          * block group.
3389          */
3390         if (block_group->key.offset < (100 * SZ_1M)) {
3391                 spin_lock(&block_group->lock);
3392                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3393                 spin_unlock(&block_group->lock);
3394                 return 0;
3395         }
3396
3397         if (trans->aborted)
3398                 return 0;
3399 again:
3400         inode = lookup_free_space_inode(fs_info, block_group, path);
3401         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3402                 ret = PTR_ERR(inode);
3403                 btrfs_release_path(path);
3404                 goto out;
3405         }
3406
3407         if (IS_ERR(inode)) {
3408                 BUG_ON(retries);
3409                 retries++;
3410
3411                 if (block_group->ro)
3412                         goto out_free;
3413
3414                 ret = create_free_space_inode(fs_info, trans, block_group,
3415                                               path);
3416                 if (ret)
3417                         goto out_free;
3418                 goto again;
3419         }
3420
3421         /*
3422          * We want to set the generation to 0, that way if anything goes wrong
3423          * from here on out we know not to trust this cache when we load up next
3424          * time.
3425          */
3426         BTRFS_I(inode)->generation = 0;
3427         ret = btrfs_update_inode(trans, root, inode);
3428         if (ret) {
3429                 /*
3430                  * So theoretically we could recover from this, simply set the
3431                  * super cache generation to 0 so we know to invalidate the
3432                  * cache, but then we'd have to keep track of the block groups
3433                  * that fail this way so we know we _have_ to reset this cache
3434                  * before the next commit or risk reading stale cache.  So to
3435                  * limit our exposure to horrible edge cases lets just abort the
3436                  * transaction, this only happens in really bad situations
3437                  * anyway.
3438                  */
3439                 btrfs_abort_transaction(trans, ret);
3440                 goto out_put;
3441         }
3442         WARN_ON(ret);
3443
3444         /* We've already setup this transaction, go ahead and exit */
3445         if (block_group->cache_generation == trans->transid &&
3446             i_size_read(inode)) {
3447                 dcs = BTRFS_DC_SETUP;
3448                 goto out_put;
3449         }
3450
3451         if (i_size_read(inode) > 0) {
3452                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3453                                         &fs_info->global_block_rsv);
3454                 if (ret)
3455                         goto out_put;
3456
3457                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3458                 if (ret)
3459                         goto out_put;
3460         }
3461
3462         spin_lock(&block_group->lock);
3463         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3464             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3465                 /*
3466                  * don't bother trying to write stuff out _if_
3467                  * a) we're not cached,
3468                  * b) we're with nospace_cache mount option,
3469                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3470                  */
3471                 dcs = BTRFS_DC_WRITTEN;
3472                 spin_unlock(&block_group->lock);
3473                 goto out_put;
3474         }
3475         spin_unlock(&block_group->lock);
3476
3477         /*
3478          * We hit an ENOSPC when setting up the cache in this transaction, just
3479          * skip doing the setup, we've already cleared the cache so we're safe.
3480          */
3481         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3482                 ret = -ENOSPC;
3483                 goto out_put;
3484         }
3485
3486         /*
3487          * Try to preallocate enough space based on how big the block group is.
3488          * Keep in mind this has to include any pinned space which could end up
3489          * taking up quite a bit since it's not folded into the other space
3490          * cache.
3491          */
3492         num_pages = div_u64(block_group->key.offset, SZ_256M);
3493         if (!num_pages)
3494                 num_pages = 1;
3495
3496         num_pages *= 16;
3497         num_pages *= PAGE_SIZE;
3498
3499         ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3500         if (ret)
3501                 goto out_put;
3502
3503         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3504                                               num_pages, num_pages,
3505                                               &alloc_hint);
3506         /*
3507          * Our cache requires contiguous chunks so that we don't modify a bunch
3508          * of metadata or split extents when writing the cache out, which means
3509          * we can enospc if we are heavily fragmented in addition to just normal
3510          * out of space conditions.  So if we hit this just skip setting up any
3511          * other block groups for this transaction, maybe we'll unpin enough
3512          * space the next time around.
3513          */
3514         if (!ret)
3515                 dcs = BTRFS_DC_SETUP;
3516         else if (ret == -ENOSPC)
3517                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3518
3519 out_put:
3520         iput(inode);
3521 out_free:
3522         btrfs_release_path(path);
3523 out:
3524         spin_lock(&block_group->lock);
3525         if (!ret && dcs == BTRFS_DC_SETUP)
3526                 block_group->cache_generation = trans->transid;
3527         block_group->disk_cache_state = dcs;
3528         spin_unlock(&block_group->lock);
3529
3530         extent_changeset_free(data_reserved);
3531         return ret;
3532 }
3533
3534 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3535                             struct btrfs_fs_info *fs_info)
3536 {
3537         struct btrfs_block_group_cache *cache, *tmp;
3538         struct btrfs_transaction *cur_trans = trans->transaction;
3539         struct btrfs_path *path;
3540
3541         if (list_empty(&cur_trans->dirty_bgs) ||
3542             !btrfs_test_opt(fs_info, SPACE_CACHE))
3543                 return 0;
3544
3545         path = btrfs_alloc_path();
3546         if (!path)
3547                 return -ENOMEM;
3548
3549         /* Could add new block groups, use _safe just in case */
3550         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3551                                  dirty_list) {
3552                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3553                         cache_save_setup(cache, trans, path);
3554         }
3555
3556         btrfs_free_path(path);
3557         return 0;
3558 }
3559
3560 /*
3561  * transaction commit does final block group cache writeback during a
3562  * critical section where nothing is allowed to change the FS.  This is
3563  * required in order for the cache to actually match the block group,
3564  * but can introduce a lot of latency into the commit.
3565  *
3566  * So, btrfs_start_dirty_block_groups is here to kick off block group
3567  * cache IO.  There's a chance we'll have to redo some of it if the
3568  * block group changes again during the commit, but it greatly reduces
3569  * the commit latency by getting rid of the easy block groups while
3570  * we're still allowing others to join the commit.
3571  */
3572 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3573 {
3574         struct btrfs_fs_info *fs_info = trans->fs_info;
3575         struct btrfs_block_group_cache *cache;
3576         struct btrfs_transaction *cur_trans = trans->transaction;
3577         int ret = 0;
3578         int should_put;
3579         struct btrfs_path *path = NULL;
3580         LIST_HEAD(dirty);
3581         struct list_head *io = &cur_trans->io_bgs;
3582         int num_started = 0;
3583         int loops = 0;
3584
3585         spin_lock(&cur_trans->dirty_bgs_lock);
3586         if (list_empty(&cur_trans->dirty_bgs)) {
3587                 spin_unlock(&cur_trans->dirty_bgs_lock);
3588                 return 0;
3589         }
3590         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3591         spin_unlock(&cur_trans->dirty_bgs_lock);
3592
3593 again:
3594         /*
3595          * make sure all the block groups on our dirty list actually
3596          * exist
3597          */
3598         btrfs_create_pending_block_groups(trans);
3599
3600         if (!path) {
3601                 path = btrfs_alloc_path();
3602                 if (!path)
3603                         return -ENOMEM;
3604         }
3605
3606         /*
3607          * cache_write_mutex is here only to save us from balance or automatic
3608          * removal of empty block groups deleting this block group while we are
3609          * writing out the cache
3610          */
3611         mutex_lock(&trans->transaction->cache_write_mutex);
3612         while (!list_empty(&dirty)) {
3613                 bool drop_reserve = true;
3614
3615                 cache = list_first_entry(&dirty,
3616                                          struct btrfs_block_group_cache,
3617                                          dirty_list);
3618                 /*
3619                  * this can happen if something re-dirties a block
3620                  * group that is already under IO.  Just wait for it to
3621                  * finish and then do it all again
3622                  */
3623                 if (!list_empty(&cache->io_list)) {
3624                         list_del_init(&cache->io_list);
3625                         btrfs_wait_cache_io(trans, cache, path);
3626                         btrfs_put_block_group(cache);
3627                 }
3628
3629
3630                 /*
3631                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3632                  * if it should update the cache_state.  Don't delete
3633                  * until after we wait.
3634                  *
3635                  * Since we're not running in the commit critical section
3636                  * we need the dirty_bgs_lock to protect from update_block_group
3637                  */
3638                 spin_lock(&cur_trans->dirty_bgs_lock);
3639                 list_del_init(&cache->dirty_list);
3640                 spin_unlock(&cur_trans->dirty_bgs_lock);
3641
3642                 should_put = 1;
3643
3644                 cache_save_setup(cache, trans, path);
3645
3646                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3647                         cache->io_ctl.inode = NULL;
3648                         ret = btrfs_write_out_cache(fs_info, trans,
3649                                                     cache, path);
3650                         if (ret == 0 && cache->io_ctl.inode) {
3651                                 num_started++;
3652                                 should_put = 0;
3653
3654                                 /*
3655                                  * The cache_write_mutex is protecting the
3656                                  * io_list, also refer to the definition of
3657                                  * btrfs_transaction::io_bgs for more details
3658                                  */
3659                                 list_add_tail(&cache->io_list, io);
3660                         } else {
3661                                 /*
3662                                  * if we failed to write the cache, the
3663                                  * generation will be bad and life goes on
3664                                  */
3665                                 ret = 0;
3666                         }
3667                 }
3668                 if (!ret) {
3669                         ret = write_one_cache_group(trans, path, cache);
3670                         /*
3671                          * Our block group might still be attached to the list
3672                          * of new block groups in the transaction handle of some
3673                          * other task (struct btrfs_trans_handle->new_bgs). This
3674                          * means its block group item isn't yet in the extent
3675                          * tree. If this happens ignore the error, as we will
3676                          * try again later in the critical section of the
3677                          * transaction commit.
3678                          */
3679                         if (ret == -ENOENT) {
3680                                 ret = 0;
3681                                 spin_lock(&cur_trans->dirty_bgs_lock);
3682                                 if (list_empty(&cache->dirty_list)) {
3683                                         list_add_tail(&cache->dirty_list,
3684                                                       &cur_trans->dirty_bgs);
3685                                         btrfs_get_block_group(cache);
3686                                         drop_reserve = false;
3687                                 }
3688                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3689                         } else if (ret) {
3690                                 btrfs_abort_transaction(trans, ret);
3691                         }
3692                 }
3693
3694                 /* if it's not on the io list, we need to put the block group */
3695                 if (should_put)
3696                         btrfs_put_block_group(cache);
3697                 if (drop_reserve)
3698                         btrfs_delayed_refs_rsv_release(fs_info, 1);
3699
3700                 if (ret)
3701                         break;
3702
3703                 /*
3704                  * Avoid blocking other tasks for too long. It might even save
3705                  * us from writing caches for block groups that are going to be
3706                  * removed.
3707                  */
3708                 mutex_unlock(&trans->transaction->cache_write_mutex);
3709                 mutex_lock(&trans->transaction->cache_write_mutex);
3710         }
3711         mutex_unlock(&trans->transaction->cache_write_mutex);
3712
3713         /*
3714          * go through delayed refs for all the stuff we've just kicked off
3715          * and then loop back (just once)
3716          */
3717         ret = btrfs_run_delayed_refs(trans, 0);
3718         if (!ret && loops == 0) {
3719                 loops++;
3720                 spin_lock(&cur_trans->dirty_bgs_lock);
3721                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3722                 /*
3723                  * dirty_bgs_lock protects us from concurrent block group
3724                  * deletes too (not just cache_write_mutex).
3725                  */
3726                 if (!list_empty(&dirty)) {
3727                         spin_unlock(&cur_trans->dirty_bgs_lock);
3728                         goto again;
3729                 }
3730                 spin_unlock(&cur_trans->dirty_bgs_lock);
3731         } else if (ret < 0) {
3732                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3733         }
3734
3735         btrfs_free_path(path);
3736         return ret;
3737 }
3738
3739 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3740                                    struct btrfs_fs_info *fs_info)
3741 {
3742         struct btrfs_block_group_cache *cache;
3743         struct btrfs_transaction *cur_trans = trans->transaction;
3744         int ret = 0;
3745         int should_put;
3746         struct btrfs_path *path;
3747         struct list_head *io = &cur_trans->io_bgs;
3748         int num_started = 0;
3749
3750         path = btrfs_alloc_path();
3751         if (!path)
3752                 return -ENOMEM;
3753
3754         /*
3755          * Even though we are in the critical section of the transaction commit,
3756          * we can still have concurrent tasks adding elements to this
3757          * transaction's list of dirty block groups. These tasks correspond to
3758          * endio free space workers started when writeback finishes for a
3759          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3760          * allocate new block groups as a result of COWing nodes of the root
3761          * tree when updating the free space inode. The writeback for the space
3762          * caches is triggered by an earlier call to
3763          * btrfs_start_dirty_block_groups() and iterations of the following
3764          * loop.
3765          * Also we want to do the cache_save_setup first and then run the
3766          * delayed refs to make sure we have the best chance at doing this all
3767          * in one shot.
3768          */
3769         spin_lock(&cur_trans->dirty_bgs_lock);
3770         while (!list_empty(&cur_trans->dirty_bgs)) {
3771                 cache = list_first_entry(&cur_trans->dirty_bgs,
3772                                          struct btrfs_block_group_cache,
3773                                          dirty_list);
3774
3775                 /*
3776                  * this can happen if cache_save_setup re-dirties a block
3777                  * group that is already under IO.  Just wait for it to
3778                  * finish and then do it all again
3779                  */
3780                 if (!list_empty(&cache->io_list)) {
3781                         spin_unlock(&cur_trans->dirty_bgs_lock);
3782                         list_del_init(&cache->io_list);
3783                         btrfs_wait_cache_io(trans, cache, path);
3784                         btrfs_put_block_group(cache);
3785                         spin_lock(&cur_trans->dirty_bgs_lock);
3786                 }
3787
3788                 /*
3789                  * don't remove from the dirty list until after we've waited
3790                  * on any pending IO
3791                  */
3792                 list_del_init(&cache->dirty_list);
3793                 spin_unlock(&cur_trans->dirty_bgs_lock);
3794                 should_put = 1;
3795
3796                 cache_save_setup(cache, trans, path);
3797
3798                 if (!ret)
3799                         ret = btrfs_run_delayed_refs(trans,
3800                                                      (unsigned long) -1);
3801
3802                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3803                         cache->io_ctl.inode = NULL;
3804                         ret = btrfs_write_out_cache(fs_info, trans,
3805                                                     cache, path);
3806                         if (ret == 0 && cache->io_ctl.inode) {
3807                                 num_started++;
3808                                 should_put = 0;
3809                                 list_add_tail(&cache->io_list, io);
3810                         } else {
3811                                 /*
3812                                  * if we failed to write the cache, the
3813                                  * generation will be bad and life goes on
3814                                  */
3815                                 ret = 0;
3816                         }
3817                 }
3818                 if (!ret) {
3819                         ret = write_one_cache_group(trans, path, cache);
3820                         /*
3821                          * One of the free space endio workers might have
3822                          * created a new block group while updating a free space
3823                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3824                          * and hasn't released its transaction handle yet, in
3825                          * which case the new block group is still attached to
3826                          * its transaction handle and its creation has not
3827                          * finished yet (no block group item in the extent tree
3828                          * yet, etc). If this is the case, wait for all free
3829                          * space endio workers to finish and retry. This is a
3830                          * very rare case so no need for a more efficient and
3831                          * complex approach.
3832                          */
3833                         if (ret == -ENOENT) {
3834                                 wait_event(cur_trans->writer_wait,
3835                                    atomic_read(&cur_trans->num_writers) == 1);
3836                                 ret = write_one_cache_group(trans, path, cache);
3837                         }
3838                         if (ret)
3839                                 btrfs_abort_transaction(trans, ret);
3840                 }
3841
3842                 /* if its not on the io list, we need to put the block group */
3843                 if (should_put)
3844                         btrfs_put_block_group(cache);
3845                 btrfs_delayed_refs_rsv_release(fs_info, 1);
3846                 spin_lock(&cur_trans->dirty_bgs_lock);
3847         }
3848         spin_unlock(&cur_trans->dirty_bgs_lock);
3849
3850         /*
3851          * Refer to the definition of io_bgs member for details why it's safe
3852          * to use it without any locking
3853          */
3854         while (!list_empty(io)) {
3855                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3856                                          io_list);
3857                 list_del_init(&cache->io_list);
3858                 btrfs_wait_cache_io(trans, cache, path);
3859                 btrfs_put_block_group(cache);
3860         }
3861
3862         btrfs_free_path(path);
3863         return ret;
3864 }
3865
3866 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3867 {
3868         struct btrfs_block_group_cache *block_group;
3869         int readonly = 0;
3870
3871         block_group = btrfs_lookup_block_group(fs_info, bytenr);
3872         if (!block_group || block_group->ro)
3873                 readonly = 1;
3874         if (block_group)
3875                 btrfs_put_block_group(block_group);
3876         return readonly;
3877 }
3878
3879 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3880 {
3881         struct btrfs_block_group_cache *bg;
3882         bool ret = true;
3883
3884         bg = btrfs_lookup_block_group(fs_info, bytenr);
3885         if (!bg)
3886                 return false;
3887
3888         spin_lock(&bg->lock);
3889         if (bg->ro)
3890                 ret = false;
3891         else
3892                 atomic_inc(&bg->nocow_writers);
3893         spin_unlock(&bg->lock);
3894
3895         /* no put on block group, done by btrfs_dec_nocow_writers */
3896         if (!ret)
3897                 btrfs_put_block_group(bg);
3898
3899         return ret;
3900
3901 }
3902
3903 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3904 {
3905         struct btrfs_block_group_cache *bg;
3906
3907         bg = btrfs_lookup_block_group(fs_info, bytenr);
3908         ASSERT(bg);
3909         if (atomic_dec_and_test(&bg->nocow_writers))
3910                 wake_up_var(&bg->nocow_writers);
3911         /*
3912          * Once for our lookup and once for the lookup done by a previous call
3913          * to btrfs_inc_nocow_writers()
3914          */
3915         btrfs_put_block_group(bg);
3916         btrfs_put_block_group(bg);
3917 }
3918
3919 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3920 {
3921         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3922 }
3923
3924 static const char *alloc_name(u64 flags)
3925 {
3926         switch (flags) {
3927         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3928                 return "mixed";
3929         case BTRFS_BLOCK_GROUP_METADATA:
3930                 return "metadata";
3931         case BTRFS_BLOCK_GROUP_DATA:
3932                 return "data";
3933         case BTRFS_BLOCK_GROUP_SYSTEM:
3934                 return "system";
3935         default:
3936                 WARN_ON(1);
3937                 return "invalid-combination";
3938         };
3939 }
3940
3941 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3942 {
3943
3944         struct btrfs_space_info *space_info;
3945         int i;
3946         int ret;
3947
3948         space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3949         if (!space_info)
3950                 return -ENOMEM;
3951
3952         ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3953                                  GFP_KERNEL);
3954         if (ret) {
3955                 kfree(space_info);
3956                 return ret;
3957         }
3958
3959         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3960                 INIT_LIST_HEAD(&space_info->block_groups[i]);
3961         init_rwsem(&space_info->groups_sem);
3962         spin_lock_init(&space_info->lock);
3963         space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3964         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3965         init_waitqueue_head(&space_info->wait);
3966         INIT_LIST_HEAD(&space_info->ro_bgs);
3967         INIT_LIST_HEAD(&space_info->tickets);
3968         INIT_LIST_HEAD(&space_info->priority_tickets);
3969
3970         ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3971                                     info->space_info_kobj, "%s",
3972                                     alloc_name(space_info->flags));
3973         if (ret) {
3974                 percpu_counter_destroy(&space_info->total_bytes_pinned);
3975                 kfree(space_info);
3976                 return ret;
3977         }
3978
3979         list_add_rcu(&space_info->list, &info->space_info);
3980         if (flags & BTRFS_BLOCK_GROUP_DATA)
3981                 info->data_sinfo = space_info;
3982
3983         return ret;
3984 }
3985
3986 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3987                              u64 total_bytes, u64 bytes_used,
3988                              u64 bytes_readonly,
3989                              struct btrfs_space_info **space_info)
3990 {
3991         struct btrfs_space_info *found;
3992         int factor;
3993
3994         factor = btrfs_bg_type_to_factor(flags);
3995
3996         found = __find_space_info(info, flags);
3997         ASSERT(found);
3998         spin_lock(&found->lock);
3999         found->total_bytes += total_bytes;
4000         found->disk_total += total_bytes * factor;
4001         found->bytes_used += bytes_used;
4002         found->disk_used += bytes_used * factor;
4003         found->bytes_readonly += bytes_readonly;
4004         if (total_bytes > 0)
4005                 found->full = 0;
4006         space_info_add_new_bytes(info, found, total_bytes -
4007                                  bytes_used - bytes_readonly);
4008         spin_unlock(&found->lock);
4009         *space_info = found;
4010 }
4011
4012 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4013 {
4014         u64 extra_flags = chunk_to_extended(flags) &
4015                                 BTRFS_EXTENDED_PROFILE_MASK;
4016
4017         write_seqlock(&fs_info->profiles_lock);
4018         if (flags & BTRFS_BLOCK_GROUP_DATA)
4019                 fs_info->avail_data_alloc_bits |= extra_flags;
4020         if (flags & BTRFS_BLOCK_GROUP_METADATA)
4021                 fs_info->avail_metadata_alloc_bits |= extra_flags;
4022         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4023                 fs_info->avail_system_alloc_bits |= extra_flags;
4024         write_sequnlock(&fs_info->profiles_lock);
4025 }
4026
4027 /*
4028  * returns target flags in extended format or 0 if restripe for this
4029  * chunk_type is not in progress
4030  *
4031  * should be called with balance_lock held
4032  */
4033 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4034 {
4035         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4036         u64 target = 0;
4037
4038         if (!bctl)
4039                 return 0;
4040
4041         if (flags & BTRFS_BLOCK_GROUP_DATA &&
4042             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4043                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4044         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4045                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4046                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4047         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4048                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4049                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4050         }
4051
4052         return target;
4053 }
4054
4055 /*
4056  * @flags: available profiles in extended format (see ctree.h)
4057  *
4058  * Returns reduced profile in chunk format.  If profile changing is in
4059  * progress (either running or paused) picks the target profile (if it's
4060  * already available), otherwise falls back to plain reducing.
4061  */
4062 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4063 {
4064         u64 num_devices = fs_info->fs_devices->rw_devices;
4065         u64 target;
4066         u64 raid_type;
4067         u64 allowed = 0;
4068
4069         /*
4070          * see if restripe for this chunk_type is in progress, if so
4071          * try to reduce to the target profile
4072          */
4073         spin_lock(&fs_info->balance_lock);
4074         target = get_restripe_target(fs_info, flags);
4075         if (target) {
4076                 /* pick target profile only if it's already available */
4077                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4078                         spin_unlock(&fs_info->balance_lock);
4079                         return extended_to_chunk(target);
4080                 }
4081         }
4082         spin_unlock(&fs_info->balance_lock);
4083
4084         /* First, mask out the RAID levels which aren't possible */
4085         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4086                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4087                         allowed |= btrfs_raid_array[raid_type].bg_flag;
4088         }
4089         allowed &= flags;
4090
4091         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4092                 allowed = BTRFS_BLOCK_GROUP_RAID6;
4093         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4094                 allowed = BTRFS_BLOCK_GROUP_RAID5;
4095         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4096                 allowed = BTRFS_BLOCK_GROUP_RAID10;
4097         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4098                 allowed = BTRFS_BLOCK_GROUP_RAID1;
4099         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4100                 allowed = BTRFS_BLOCK_GROUP_RAID0;
4101
4102         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4103
4104         return extended_to_chunk(flags | allowed);
4105 }
4106
4107 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4108 {
4109         unsigned seq;
4110         u64 flags;
4111
4112         do {
4113                 flags = orig_flags;
4114                 seq = read_seqbegin(&fs_info->profiles_lock);
4115
4116                 if (flags & BTRFS_BLOCK_GROUP_DATA)
4117                         flags |= fs_info->avail_data_alloc_bits;
4118                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4119                         flags |= fs_info->avail_system_alloc_bits;
4120                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4121                         flags |= fs_info->avail_metadata_alloc_bits;
4122         } while (read_seqretry(&fs_info->profiles_lock, seq));
4123
4124         return btrfs_reduce_alloc_profile(fs_info, flags);
4125 }
4126
4127 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4128 {
4129         struct btrfs_fs_info *fs_info = root->fs_info;
4130         u64 flags;
4131         u64 ret;
4132
4133         if (data)
4134                 flags = BTRFS_BLOCK_GROUP_DATA;
4135         else if (root == fs_info->chunk_root)
4136                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4137         else
4138                 flags = BTRFS_BLOCK_GROUP_METADATA;
4139
4140         ret = get_alloc_profile(fs_info, flags);
4141         return ret;
4142 }
4143
4144 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4145 {
4146         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4147 }
4148
4149 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4150 {
4151         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4152 }
4153
4154 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4155 {
4156         return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4157 }
4158
4159 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4160                                  bool may_use_included)
4161 {
4162         ASSERT(s_info);
4163         return s_info->bytes_used + s_info->bytes_reserved +
4164                 s_info->bytes_pinned + s_info->bytes_readonly +
4165                 (may_use_included ? s_info->bytes_may_use : 0);
4166 }
4167
/*
 * Reserve @bytes (rounded up to the sector size) of data space for @inode,
 * allocating a new data chunk or committing the transaction when the data
 * space info does not have enough room.
 *
 * On success the bytes are added to the data space info's bytes_may_use and
 * 0 is returned.  Returns -ENOSPC when the space cannot be found even after
 * the retries below, or the error of a failed transaction join, chunk
 * allocation, transaction commit or delayed iput wait.
 */
4168 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4169 {
4170         struct btrfs_root *root = inode->root;
4171         struct btrfs_fs_info *fs_info = root->fs_info;
4172         struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4173         u64 used;
4174         int ret = 0;
4175         int need_commit = 2;
4176         int have_pinned_space;
4177
4178         /* make sure bytes are sectorsize aligned */
4179         bytes = ALIGN(bytes, fs_info->sectorsize);
4180
        /*
         * With need_commit == 0 the commit-and-retry section at commit_trans
         * is skipped entirely for free space inodes.
         */
4181         if (btrfs_is_free_space_inode(inode)) {
4182                 need_commit = 0;
4183                 ASSERT(current->journal_info);
4184         }
4185
4186 again:
4187         /* make sure we have enough space to handle the data first */
4188         spin_lock(&data_sinfo->lock);
4189         used = btrfs_space_info_used(data_sinfo, true);
4190
4191         if (used + bytes > data_sinfo->total_bytes) {
4192                 struct btrfs_trans_handle *trans;
4193
4194                 /*
4195                  * if we don't have enough free bytes in this space then we need
4196                  * to alloc a new chunk.
4197                  */
4198                 if (!data_sinfo->full) {
4199                         u64 alloc_target;
4200
                        /*
                         * do_chunk_alloc() raises its force argument to the
                         * space info's force_alloc, so setting it here makes
                         * the CHUNK_ALLOC_NO_FORCE call below act as a forced
                         * allocation as long as the value is still set.
                         */
4201                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4202                         spin_unlock(&data_sinfo->lock);
4203
4204                         alloc_target = btrfs_data_alloc_profile(fs_info);
4205                         /*
4206                          * It is ugly that we don't call nolock join
4207                          * transaction for the free space inode case here.
4208                          * But it is safe because we only do the data space
4209                          * reservation for the free space cache in the
4210                          * transaction context, the common join transaction
4211                          * just increase the counter of the current transaction
4212                          * handler, doesn't try to acquire the trans_lock of
4213                          * the fs.
4214                          */
4215                         trans = btrfs_join_transaction(root);
4216                         if (IS_ERR(trans))
4217                                 return PTR_ERR(trans);
4218
4219                         ret = do_chunk_alloc(trans, alloc_target,
4220                                              CHUNK_ALLOC_NO_FORCE);
4221                         btrfs_end_transaction(trans);
4222                         if (ret < 0) {
4223                                 if (ret != -ENOSPC)
4224                                         return ret;
4225                                 else {
                                        /*
                                         * A value >= 0 satisfies the commit
                                         * condition checked at commit_trans.
                                         */
4226                                         have_pinned_space = 1;
4227                                         goto commit_trans;
4228                                 }
4229                         }
4230
4231                         goto again;
4232                 }
4233
4234                 /*
4235                  * If we don't have enough pinned space to deal with this
4236                  * allocation, and no removed chunk in current transaction,
4237                  * don't bother committing the transaction.
4238                  */
4239                 have_pinned_space = __percpu_counter_compare(
4240                         &data_sinfo->total_bytes_pinned,
4241                         used + bytes - data_sinfo->total_bytes,
4242                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
4243                 spin_unlock(&data_sinfo->lock);
4244
4245                 /* commit the current transaction and try again */
4246 commit_trans:
                /*
                 * need_commit starts at 2 for regular inodes.  The first pass
                 * (need_commit becomes 1) also flushes delalloc, waits for
                 * ordered extents and always commits.  The second pass
                 * (need_commit becomes 0) only commits when have_pinned_space
                 * is >= 0 or the transaction has BTRFS_TRANS_HAVE_FREE_BGS
                 * set.  After that the function gives up with -ENOSPC.
                 */
4247                 if (need_commit) {
4248                         need_commit--;
4249
4250                         if (need_commit > 0) {
4251                                 btrfs_start_delalloc_roots(fs_info, -1);
4252                                 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4253                                                          (u64)-1);
4254                         }
4255
4256                         trans = btrfs_join_transaction(root);
4257                         if (IS_ERR(trans))
4258                                 return PTR_ERR(trans);
4259                         if (have_pinned_space >= 0 ||
4260                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4261                                      &trans->transaction->flags) ||
4262                             need_commit > 0) {
4263                                 ret = btrfs_commit_transaction(trans);
4264                                 if (ret)
4265                                         return ret;
4266                                 /*
4267                                  * The cleaner kthread might still be doing iput
4268                                  * operations. Wait for it to finish so that
4269                                  * more space is released.  We don't need to
4270                                  * explicitly run the delayed iputs here because
4271                                  * the commit_transaction would have woken up
4272                                  * the cleaner.
4273                                  */
4274                                 ret = btrfs_wait_on_delayed_iputs(fs_info);
4275                                 if (ret)
4276                                         return ret;
4277                                 goto again;
4278                         } else {
4279                                 btrfs_end_transaction(trans);
4280                         }
4281                 }
4282
                /* The space info lock is not held on any path reaching here. */
4283                 trace_btrfs_space_reservation(fs_info,
4284                                               "space_info:enospc",
4285                                               data_sinfo->flags, bytes, 1);
4286                 return -ENOSPC;
4287         }
        /* Enough room: account the reservation while still holding the lock. */
4288         update_bytes_may_use(data_sinfo, bytes);
4289         trace_btrfs_space_reservation(fs_info, "space_info",
4290                                       data_sinfo->flags, bytes, 1);
4291         spin_unlock(&data_sinfo->lock);
4292
4293         return 0;
4294 }
4295
4296 int btrfs_check_data_free_space(struct inode *inode,
4297                         struct extent_changeset **reserved, u64 start, u64 len)
4298 {
4299         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4300         int ret;
4301
4302         /* align the range */
4303         len = round_up(start + len, fs_info->sectorsize) -
4304               round_down(start, fs_info->sectorsize);
4305         start = round_down(start, fs_info->sectorsize);
4306
4307         ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4308         if (ret < 0)
4309                 return ret;
4310
4311         /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4312         ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4313         if (ret < 0)
4314                 btrfs_free_reserved_data_space_noquota(inode, start, len);
4315         else
4316                 ret = 0;
4317         return ret;
4318 }
4319
4320 /*
4321  * Called if we need to clear a data reservation for this inode
4322  * Normally in a error case.
4323  *
4324  * This one will *NOT* use accurate qgroup reserved space API, just for case
4325  * which we can't sleep and is sure it won't affect qgroup reserved space.
4326  * Like clear_bit_hook().
4327  */
4328 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4329                                             u64 len)
4330 {
4331         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4332         struct btrfs_space_info *data_sinfo;
4333
4334         /* Make sure the range is aligned to sectorsize */
4335         len = round_up(start + len, fs_info->sectorsize) -
4336               round_down(start, fs_info->sectorsize);
4337         start = round_down(start, fs_info->sectorsize);
4338
4339         data_sinfo = fs_info->data_sinfo;
4340         spin_lock(&data_sinfo->lock);
4341         update_bytes_may_use(data_sinfo, -len);
4342         trace_btrfs_space_reservation(fs_info, "space_info",
4343                                       data_sinfo->flags, len, 0);
4344         spin_unlock(&data_sinfo->lock);
4345 }
4346
4347 /*
4348  * Called if we need to clear a data reservation for this inode
4349  * Normally in a error case.
4350  *
4351  * This one will handle the per-inode data rsv map for accurate reserved
4352  * space framework.
4353  */
4354 void btrfs_free_reserved_data_space(struct inode *inode,
4355                         struct extent_changeset *reserved, u64 start, u64 len)
4356 {
4357         struct btrfs_root *root = BTRFS_I(inode)->root;
4358
4359         /* Make sure the range is aligned to sectorsize */
4360         len = round_up(start + len, root->fs_info->sectorsize) -
4361               round_down(start, root->fs_info->sectorsize);
4362         start = round_down(start, root->fs_info->sectorsize);
4363
4364         btrfs_free_reserved_data_space_noquota(inode, start, len);
4365         btrfs_qgroup_free_data(inode, reserved, start, len);
4366 }
4367
4368 static void force_metadata_allocation(struct btrfs_fs_info *info)
4369 {
4370         struct list_head *head = &info->space_info;
4371         struct btrfs_space_info *found;
4372
4373         rcu_read_lock();
4374         list_for_each_entry_rcu(found, head, list) {
4375                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4376                         found->force_alloc = CHUNK_ALLOC_FORCE;
4377         }
4378         rcu_read_unlock();
4379 }
4380
4381 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4382 {
4383         return (global->size << 1);
4384 }
4385
4386 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4387                               struct btrfs_space_info *sinfo, int force)
4388 {
4389         u64 bytes_used = btrfs_space_info_used(sinfo, false);
4390         u64 thresh;
4391
4392         if (force == CHUNK_ALLOC_FORCE)
4393                 return 1;
4394
4395         /*
4396          * in limited mode, we want to have some free space up to
4397          * about 1% of the FS size.
4398          */
4399         if (force == CHUNK_ALLOC_LIMITED) {
4400                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4401                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4402
4403                 if (sinfo->total_bytes - bytes_used < thresh)
4404                         return 1;
4405         }
4406
4407         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4408                 return 0;
4409         return 1;
4410 }
4411
4412 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4413 {
4414         u64 num_dev;
4415
4416         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4417                     BTRFS_BLOCK_GROUP_RAID0 |
4418                     BTRFS_BLOCK_GROUP_RAID5 |
4419                     BTRFS_BLOCK_GROUP_RAID6))
4420                 num_dev = fs_info->fs_devices->rw_devices;
4421         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4422                 num_dev = 2;
4423         else
4424                 num_dev = 1;    /* DUP or single */
4425
4426         return num_dev;
4427 }
4428
4429 /*
4430  * If @is_allocation is true, reserve space in the system space info necessary
4431  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4432  * removing a chunk.
4433  */
4434 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4435 {
4436         struct btrfs_fs_info *fs_info = trans->fs_info;
4437         struct btrfs_space_info *info;
4438         u64 left;
4439         u64 thresh;
4440         int ret = 0;
4441         u64 num_devs;
4442
4443         /*
4444          * Needed because we can end up allocating a system chunk and for an
4445          * atomic and race free space reservation in the chunk block reserve.
4446          */
4447         lockdep_assert_held(&fs_info->chunk_mutex);
4448
4449         info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4450         spin_lock(&info->lock);
4451         left = info->total_bytes - btrfs_space_info_used(info, true);
4452         spin_unlock(&info->lock);
4453
4454         num_devs = get_profile_num_devs(fs_info, type);
4455
4456         /* num_devs device items to update and 1 chunk item to add or remove */
4457         thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4458                 btrfs_calc_trans_metadata_size(fs_info, 1);
4459
4460         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4461                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4462                            left, thresh, type);
4463                 dump_space_info(fs_info, info, 0, 0);
4464         }
4465
4466         if (left < thresh) {
4467                 u64 flags = btrfs_system_alloc_profile(fs_info);
4468
4469                 /*
4470                  * Ignore failure to create system chunk. We might end up not
4471                  * needing it, as we might not need to COW all nodes/leafs from
4472                  * the paths we visit in the chunk tree (they were already COWed
4473                  * or created in the current transaction for example).
4474                  */
4475                 ret = btrfs_alloc_chunk(trans, flags);
4476         }
4477
4478         if (!ret) {
4479                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4480                                           &fs_info->chunk_block_rsv,
4481                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4482                 if (!ret)
4483                         trans->chunk_bytes_reserved += thresh;
4484         }
4485 }
4486
4487 /*
4488  * If force is CHUNK_ALLOC_FORCE:
4489  *    - return 1 if it successfully allocates a chunk,
4490  *    - return errors including -ENOSPC otherwise.
4491  * If force is NOT CHUNK_ALLOC_FORCE:
4492  *    - return 0 if it doesn't need to allocate a new chunk,
4493  *    - return 1 if it successfully allocates a chunk,
4494  *    - return errors including -ENOSPC otherwise.
4495  */
4496 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4497                           int force)
4498 {
4499         struct btrfs_fs_info *fs_info = trans->fs_info;
4500         struct btrfs_space_info *space_info;
4501         bool wait_for_alloc = false;
4502         bool should_alloc = false;
4503         int ret = 0;
4504
4505         /* Don't re-enter if we're already allocating a chunk */
4506         if (trans->allocating_chunk)
4507                 return -ENOSPC;
4508
4509         space_info = __find_space_info(fs_info, flags);
4510         ASSERT(space_info);
4511
4512         do {
4513                 spin_lock(&space_info->lock);
4514                 if (force < space_info->force_alloc)
4515                         force = space_info->force_alloc;
4516                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4517                 if (space_info->full) {
4518                         /* No more free physical space */
4519                         if (should_alloc)
4520                                 ret = -ENOSPC;
4521                         else
4522                                 ret = 0;
4523                         spin_unlock(&space_info->lock);
4524                         return ret;
4525                 } else if (!should_alloc) {
4526                         spin_unlock(&space_info->lock);
4527                         return 0;
4528                 } else if (space_info->chunk_alloc) {
4529                         /*
4530                          * Someone is already allocating, so we need to block
4531                          * until this someone is finished and then loop to
4532                          * recheck if we should continue with our allocation
4533                          * attempt.
4534                          */
4535                         wait_for_alloc = true;
4536                         spin_unlock(&space_info->lock);
4537                         mutex_lock(&fs_info->chunk_mutex);
4538                         mutex_unlock(&fs_info->chunk_mutex);
4539                 } else {
4540                         /* Proceed with allocation */
4541                         space_info->chunk_alloc = 1;
4542                         wait_for_alloc = false;
4543                         spin_unlock(&space_info->lock);
4544                 }
4545
4546                 cond_resched();
4547         } while (wait_for_alloc);
4548
4549         mutex_lock(&fs_info->chunk_mutex);
4550         trans->allocating_chunk = true;
4551
4552         /*
4553          * If we have mixed data/metadata chunks we want to make sure we keep
4554          * allocating mixed chunks instead of individual chunks.
4555          */
4556         if (btrfs_mixed_space_info(space_info))
4557                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4558
4559         /*
4560          * if we're doing a data chunk, go ahead and make sure that
4561          * we keep a reasonable number of metadata chunks allocated in the
4562          * FS as well.
4563          */
4564         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4565                 fs_info->data_chunk_allocations++;
4566                 if (!(fs_info->data_chunk_allocations %
4567                       fs_info->metadata_ratio))
4568                         force_metadata_allocation(fs_info);
4569         }
4570
4571         /*
4572          * Check if we have enough space in SYSTEM chunk because we may need
4573          * to update devices.
4574          */
4575         check_system_chunk(trans, flags);
4576
4577         ret = btrfs_alloc_chunk(trans, flags);
4578         trans->allocating_chunk = false;
4579
4580         spin_lock(&space_info->lock);
4581         if (ret < 0) {
4582                 if (ret == -ENOSPC)
4583                         space_info->full = 1;
4584                 else
4585                         goto out;
4586         } else {
4587                 ret = 1;
4588                 space_info->max_extent_size = 0;
4589         }
4590
4591         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4592 out:
4593         space_info->chunk_alloc = 0;
4594         spin_unlock(&space_info->lock);
4595         mutex_unlock(&fs_info->chunk_mutex);
4596         /*
4597          * When we allocate a new chunk we reserve space in the chunk block
4598          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4599          * add new nodes/leafs to it if we end up needing to do it when
4600          * inserting the chunk item and updating device items as part of the
4601          * second phase of chunk allocation, performed by
4602          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4603          * large number of new block groups to create in our transaction
4604          * handle's new_bgs list to avoid exhausting the chunk block reserve
4605          * in extreme cases - like having a single transaction create many new
4606          * block groups when starting to write out the free space caches of all
4607          * the block groups that were made dirty during the lifetime of the
4608          * transaction.
4609          */
4610         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4611                 btrfs_create_pending_block_groups(trans);
4612
4613         return ret;
4614 }
4615
4616 static int can_overcommit(struct btrfs_fs_info *fs_info,
4617                           struct btrfs_space_info *space_info, u64 bytes,
4618                           enum btrfs_reserve_flush_enum flush,
4619                           bool system_chunk)
4620 {
4621         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4622         u64 profile;
4623         u64 space_size;
4624         u64 avail;
4625         u64 used;
4626         int factor;
4627
4628         /* Don't overcommit when in mixed mode. */
4629         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4630                 return 0;
4631
4632         if (system_chunk)
4633                 profile = btrfs_system_alloc_profile(fs_info);
4634         else
4635                 profile = btrfs_metadata_alloc_profile(fs_info);
4636
4637         used = btrfs_space_info_used(space_info, false);
4638
4639         /*
4640          * We only want to allow over committing if we have lots of actual space
4641          * free, but if we don't have enough space to handle the global reserve
4642          * space then we could end up having a real enospc problem when trying
4643          * to allocate a chunk or some other such important allocation.
4644          */
4645         spin_lock(&global_rsv->lock);
4646         space_size = calc_global_rsv_need_space(global_rsv);
4647         spin_unlock(&global_rsv->lock);
4648         if (used + space_size >= space_info->total_bytes)
4649                 return 0;
4650
4651         used += space_info->bytes_may_use;
4652
4653         avail = atomic64_read(&fs_info->free_chunk_space);
4654
4655         /*
4656          * If we have dup, raid1 or raid10 then only half of the free
4657          * space is actually usable.  For raid56, the space info used
4658          * doesn't include the parity drive, so we don't have to
4659          * change the math
4660          */
4661         factor = btrfs_bg_type_to_factor(profile);
4662         avail = div_u64(avail, factor);
4663
4664         /*
4665          * If we aren't flushing all things, let us overcommit up to
4666          * 1/2th of the space. If we can flush, don't let us overcommit
4667          * too much, let it overcommit up to 1/8 of the space.
4668          */
4669         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4670                 avail >>= 3;
4671         else
4672                 avail >>= 1;
4673
4674         if (used + bytes < space_info->total_bytes + avail)
4675                 return 1;
4676         return 0;
4677 }
4678
4679 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4680                                          unsigned long nr_pages, int nr_items)
4681 {
4682         struct super_block *sb = fs_info->sb;
4683
4684         if (down_read_trylock(&sb->s_umount)) {
4685                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4686                 up_read(&sb->s_umount);
4687         } else {
4688                 /*
4689                  * We needn't worry the filesystem going from r/w to r/o though
4690                  * we don't acquire ->s_umount mutex, because the filesystem
4691                  * should guarantee the delalloc inodes list be empty after
4692                  * the filesystem is readonly(all dirty pages are written to
4693                  * the disk).
4694                  */
4695                 btrfs_start_delalloc_roots(fs_info, nr_items);
4696                 if (!current->journal_info)
4697                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4698         }
4699 }
4700
4701 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4702                                         u64 to_reclaim)
4703 {
4704         u64 bytes;
4705         u64 nr;
4706
4707         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4708         nr = div64_u64(to_reclaim, bytes);
4709         if (!nr)
4710                 nr = 1;
4711         return nr;
4712 }
4713
4714 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4715
4716 /*
4717  * shrink metadata reservation for delalloc
4718  */
4719 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4720                             u64 orig, bool wait_ordered)
4721 {
4722         struct btrfs_space_info *space_info;
4723         struct btrfs_trans_handle *trans;
4724         u64 delalloc_bytes;
4725         u64 async_pages;
4726         u64 items;
4727         long time_left;
4728         unsigned long nr_pages;
4729         int loops;
4730
4731         /* Calc the number of the pages we need flush for space reservation */
4732         items = calc_reclaim_items_nr(fs_info, to_reclaim);
4733         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4734
4735         trans = (struct btrfs_trans_handle *)current->journal_info;
4736         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4737
4738         delalloc_bytes = percpu_counter_sum_positive(
4739                                                 &fs_info->delalloc_bytes);
4740         if (delalloc_bytes == 0) {
4741                 if (trans)
4742                         return;
4743                 if (wait_ordered)
4744                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4745                 return;
4746         }
4747
4748         loops = 0;
4749         while (delalloc_bytes && loops < 3) {
4750                 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4751
4752                 /*
4753                  * Triggers inode writeback for up to nr_pages. This will invoke
4754                  * ->writepages callback and trigger delalloc filling
4755                  *  (btrfs_run_delalloc_range()).
4756                  */
4757                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4758
4759                 /*
4760                  * We need to wait for the compressed pages to start before
4761                  * we continue.
4762                  */
4763                 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4764                 if (!async_pages)
4765                         goto skip_async;
4766
4767                 /*
4768                  * Calculate how many compressed pages we want to be written
4769                  * before we continue. I.e if there are more async pages than we
4770                  * require wait_event will wait until nr_pages are written.
4771                  */
4772                 if (async_pages <= nr_pages)
4773                         async_pages = 0;
4774                 else
4775                         async_pages -= nr_pages;
4776
4777                 wait_event(fs_info->async_submit_wait,
4778                            atomic_read(&fs_info->async_delalloc_pages) <=
4779                            (int)async_pages);
4780 skip_async:
4781                 spin_lock(&space_info->lock);
4782                 if (list_empty(&space_info->tickets) &&
4783                     list_empty(&space_info->priority_tickets)) {
4784                         spin_unlock(&space_info->lock);
4785                         break;
4786                 }
4787                 spin_unlock(&space_info->lock);
4788
4789                 loops++;
4790                 if (wait_ordered && !trans) {
4791                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4792                 } else {
4793                         time_left = schedule_timeout_killable(1);
4794                         if (time_left)
4795                                 break;
4796                 }
4797                 delalloc_bytes = percpu_counter_sum_positive(
4798                                                 &fs_info->delalloc_bytes);
4799         }
4800 }
4801
/*
 * A metadata reservation request that could not be satisfied immediately and
 * is queued on a space_info until space shows up or the request fails.
 *
 * @orig_bytes: size of the request when the ticket was created
 * @bytes:      what is still outstanding; starts equal to @orig_bytes and the
 *              ticket counts as satisfied once it reaches 0 (presumably
 *              decremented by code outside this part of the file as space is
 *              handed to the ticket)
 * @error:      0 while pending; set to -ENOSPC by wake_all_tickets() and
 *              returned to the waiter by wait_reserve_ticket()
 * @list:       links the ticket into space_info->tickets or
 *              space_info->priority_tickets
 * @wait:       wait queue the requesting task sleeps on in
 *              wait_reserve_ticket()
 */
4802 struct reserve_ticket {
4803         u64 orig_bytes;
4804         u64 bytes;
4805         int error;
4806         struct list_head list;
4807         wait_queue_head_t wait;
4808 };
4809
4810 /**
4811  * maybe_commit_transaction - possibly commit the transaction if its ok to
4812  * @root - the root we're allocating for
4813  * @bytes - the number of bytes we want to reserve
4814  * @force - force the commit
4815  *
4816  * This will check to make sure that committing the transaction will actually
4817  * get us somewhere and then commit the transaction if it does.  Otherwise it
4818  * will return -ENOSPC.
4819  */
4820 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4821                                   struct btrfs_space_info *space_info)
4822 {
4823         struct reserve_ticket *ticket = NULL;
4824         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4825         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4826         struct btrfs_trans_handle *trans;
4827         u64 bytes_needed;
4828         u64 reclaim_bytes = 0;
4829
4830         trans = (struct btrfs_trans_handle *)current->journal_info;
4831         if (trans)
4832                 return -EAGAIN;
4833
4834         spin_lock(&space_info->lock);
4835         if (!list_empty(&space_info->priority_tickets))
4836                 ticket = list_first_entry(&space_info->priority_tickets,
4837                                           struct reserve_ticket, list);
4838         else if (!list_empty(&space_info->tickets))
4839                 ticket = list_first_entry(&space_info->tickets,
4840                                           struct reserve_ticket, list);
4841         bytes_needed = (ticket) ? ticket->bytes : 0;
4842         spin_unlock(&space_info->lock);
4843
4844         if (!bytes_needed)
4845                 return 0;
4846
4847         trans = btrfs_join_transaction(fs_info->extent_root);
4848         if (IS_ERR(trans))
4849                 return PTR_ERR(trans);
4850
4851         /*
4852          * See if there is enough pinned space to make this reservation, or if
4853          * we have block groups that are going to be freed, allowing us to
4854          * possibly do a chunk allocation the next loop through.
4855          */
4856         if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4857             __percpu_counter_compare(&space_info->total_bytes_pinned,
4858                                      bytes_needed,
4859                                      BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4860                 goto commit;
4861
4862         /*
4863          * See if there is some space in the delayed insertion reservation for
4864          * this reservation.
4865          */
4866         if (space_info != delayed_rsv->space_info)
4867                 goto enospc;
4868
4869         spin_lock(&delayed_rsv->lock);
4870         reclaim_bytes += delayed_rsv->reserved;
4871         spin_unlock(&delayed_rsv->lock);
4872
4873         spin_lock(&delayed_refs_rsv->lock);
4874         reclaim_bytes += delayed_refs_rsv->reserved;
4875         spin_unlock(&delayed_refs_rsv->lock);
4876         if (reclaim_bytes >= bytes_needed)
4877                 goto commit;
4878         bytes_needed -= reclaim_bytes;
4879
4880         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4881                                    bytes_needed,
4882                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4883                 goto enospc;
4884
4885 commit:
4886         return btrfs_commit_transaction(trans);
4887 enospc:
4888         btrfs_end_transaction(trans);
4889         return -ENOSPC;
4890 }
4891
4892 /*
4893  * Try to flush some data based on policy set by @state. This is only advisory
4894  * and may fail for various reasons. The caller is supposed to examine the
4895  * state of @space_info to detect the outcome.
4896  */
4897 static void flush_space(struct btrfs_fs_info *fs_info,
4898                        struct btrfs_space_info *space_info, u64 num_bytes,
4899                        int state)
4900 {
4901         struct btrfs_root *root = fs_info->extent_root;
4902         struct btrfs_trans_handle *trans;
4903         int nr;
4904         int ret = 0;
4905
4906         switch (state) {
4907         case FLUSH_DELAYED_ITEMS_NR:
4908         case FLUSH_DELAYED_ITEMS:
4909                 if (state == FLUSH_DELAYED_ITEMS_NR)
4910                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4911                 else
4912                         nr = -1;
4913
4914                 trans = btrfs_join_transaction(root);
4915                 if (IS_ERR(trans)) {
4916                         ret = PTR_ERR(trans);
4917                         break;
4918                 }
4919                 ret = btrfs_run_delayed_items_nr(trans, nr);
4920                 btrfs_end_transaction(trans);
4921                 break;
4922         case FLUSH_DELALLOC:
4923         case FLUSH_DELALLOC_WAIT:
4924                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4925                                 state == FLUSH_DELALLOC_WAIT);
4926                 break;
4927         case FLUSH_DELAYED_REFS_NR:
4928         case FLUSH_DELAYED_REFS:
4929                 trans = btrfs_join_transaction(root);
4930                 if (IS_ERR(trans)) {
4931                         ret = PTR_ERR(trans);
4932                         break;
4933                 }
4934                 if (state == FLUSH_DELAYED_REFS_NR)
4935                         nr = calc_reclaim_items_nr(fs_info, num_bytes);
4936                 else
4937                         nr = 0;
4938                 btrfs_run_delayed_refs(trans, nr);
4939                 btrfs_end_transaction(trans);
4940                 break;
4941         case ALLOC_CHUNK:
4942         case ALLOC_CHUNK_FORCE:
4943                 trans = btrfs_join_transaction(root);
4944                 if (IS_ERR(trans)) {
4945                         ret = PTR_ERR(trans);
4946                         break;
4947                 }
4948                 ret = do_chunk_alloc(trans,
4949                                      btrfs_metadata_alloc_profile(fs_info),
4950                                      (state == ALLOC_CHUNK) ?
4951                                       CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
4952                 btrfs_end_transaction(trans);
4953                 if (ret > 0 || ret == -ENOSPC)
4954                         ret = 0;
4955                 break;
4956         case COMMIT_TRANS:
4957                 /*
4958                  * If we have pending delayed iputs then we could free up a
4959                  * bunch of pinned space, so make sure we run the iputs before
4960                  * we do our pinned bytes check below.
4961                  */
4962                 btrfs_run_delayed_iputs(fs_info);
4963                 btrfs_wait_on_delayed_iputs(fs_info);
4964
4965                 ret = may_commit_transaction(fs_info, space_info);
4966                 break;
4967         default:
4968                 ret = -ENOSPC;
4969                 break;
4970         }
4971
4972         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4973                                 ret);
4974         return;
4975 }
4976
4977 static inline u64
4978 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4979                                  struct btrfs_space_info *space_info,
4980                                  bool system_chunk)
4981 {
4982         struct reserve_ticket *ticket;
4983         u64 used;
4984         u64 expected;
4985         u64 to_reclaim = 0;
4986
4987         list_for_each_entry(ticket, &space_info->tickets, list)
4988                 to_reclaim += ticket->bytes;
4989         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4990                 to_reclaim += ticket->bytes;
4991         if (to_reclaim)
4992                 return to_reclaim;
4993
4994         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4995         if (can_overcommit(fs_info, space_info, to_reclaim,
4996                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4997                 return 0;
4998
4999         used = btrfs_space_info_used(space_info, true);
5000
5001         if (can_overcommit(fs_info, space_info, SZ_1M,
5002                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5003                 expected = div_factor_fine(space_info->total_bytes, 95);
5004         else
5005                 expected = div_factor_fine(space_info->total_bytes, 90);
5006
5007         if (used > expected)
5008                 to_reclaim = used - expected;
5009         else
5010                 to_reclaim = 0;
5011         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5012                                      space_info->bytes_reserved);
5013         return to_reclaim;
5014 }
5015
5016 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5017                                         struct btrfs_space_info *space_info,
5018                                         u64 used, bool system_chunk)
5019 {
5020         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5021
5022         /* If we're just plain full then async reclaim just slows us down. */
5023         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5024                 return 0;
5025
5026         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5027                                               system_chunk))
5028                 return 0;
5029
5030         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5031                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5032 }
5033
5034 static bool wake_all_tickets(struct list_head *head)
5035 {
5036         struct reserve_ticket *ticket;
5037
5038         while (!list_empty(head)) {
5039                 ticket = list_first_entry(head, struct reserve_ticket, list);
5040                 list_del_init(&ticket->list);
5041                 ticket->error = -ENOSPC;
5042                 wake_up(&ticket->wait);
5043                 if (ticket->bytes != ticket->orig_bytes)
5044                         return true;
5045         }
5046         return false;
5047 }
5048
5049 /*
5050  * This is for normal flushers, we can wait all goddamned day if we want to.  We
5051  * will loop and continuously try to flush as long as we are making progress.
5052  * We count progress as clearing off tickets each time we have to loop.
5053  */
5054 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5055 {
5056         struct btrfs_fs_info *fs_info;
5057         struct btrfs_space_info *space_info;
5058         u64 to_reclaim;
5059         int flush_state;
5060         int commit_cycles = 0;
5061         u64 last_tickets_id;
5062
5063         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5064         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5065
5066         spin_lock(&space_info->lock);
5067         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5068                                                       false);
5069         if (!to_reclaim) {
5070                 space_info->flush = 0;
5071                 spin_unlock(&space_info->lock);
5072                 return;
5073         }
5074         last_tickets_id = space_info->tickets_id;
5075         spin_unlock(&space_info->lock);
5076
5077         flush_state = FLUSH_DELAYED_ITEMS_NR;
5078         do {
5079                 flush_space(fs_info, space_info, to_reclaim, flush_state);
5080                 spin_lock(&space_info->lock);
5081                 if (list_empty(&space_info->tickets)) {
5082                         space_info->flush = 0;
5083                         spin_unlock(&space_info->lock);
5084                         return;
5085                 }
5086                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5087                                                               space_info,
5088                                                               false);
5089                 if (last_tickets_id == space_info->tickets_id) {
5090                         flush_state++;
5091                 } else {
5092                         last_tickets_id = space_info->tickets_id;
5093                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5094                         if (commit_cycles)
5095                                 commit_cycles--;
5096                 }
5097
5098                 /*
5099                  * We don't want to force a chunk allocation until we've tried
5100                  * pretty hard to reclaim space.  Think of the case where we
5101                  * freed up a bunch of space and so have a lot of pinned space
5102                  * to reclaim.  We would rather use that than possibly create a
5103                  * underutilized metadata chunk.  So if this is our first run
5104                  * through the flushing state machine skip ALLOC_CHUNK_FORCE and
5105                  * commit the transaction.  If nothing has changed the next go
5106                  * around then we can force a chunk allocation.
5107                  */
5108                 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
5109                         flush_state++;
5110
5111                 if (flush_state > COMMIT_TRANS) {
5112                         commit_cycles++;
5113                         if (commit_cycles > 2) {
5114                                 if (wake_all_tickets(&space_info->tickets)) {
5115                                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5116                                         commit_cycles--;
5117                                 } else {
5118                                         space_info->flush = 0;
5119                                 }
5120                         } else {
5121                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
5122                         }
5123                 }
5124                 spin_unlock(&space_info->lock);
5125         } while (flush_state <= COMMIT_TRANS);
5126 }
5127
/*
 * Set up @work so that it runs btrfs_async_reclaim_metadata_space() when
 * queued.
 *
 * The worker recovers its fs_info with container_of() on the
 * async_reclaim_work member, so @work is presumably always
 * &fs_info->async_reclaim_work - confirm against the caller.
 */
5128 void btrfs_init_async_reclaim_work(struct work_struct *work)
5129 {
5130         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5131 }
5132
5133 static const enum btrfs_flush_state priority_flush_states[] = {
5134         FLUSH_DELAYED_ITEMS_NR,
5135         FLUSH_DELAYED_ITEMS,
5136         ALLOC_CHUNK,
5137 };
5138
5139 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5140                                             struct btrfs_space_info *space_info,
5141                                             struct reserve_ticket *ticket)
5142 {
5143         u64 to_reclaim;
5144         int flush_state;
5145
5146         spin_lock(&space_info->lock);
5147         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5148                                                       false);
5149         if (!to_reclaim) {
5150                 spin_unlock(&space_info->lock);
5151                 return;
5152         }
5153         spin_unlock(&space_info->lock);
5154
5155         flush_state = 0;
5156         do {
5157                 flush_space(fs_info, space_info, to_reclaim,
5158                             priority_flush_states[flush_state]);
5159                 flush_state++;
5160                 spin_lock(&space_info->lock);
5161                 if (ticket->bytes == 0) {
5162                         spin_unlock(&space_info->lock);
5163                         return;
5164                 }
5165                 spin_unlock(&space_info->lock);
5166         } while (flush_state < ARRAY_SIZE(priority_flush_states));
5167 }
5168
5169 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5170                                struct btrfs_space_info *space_info,
5171                                struct reserve_ticket *ticket)
5172
5173 {
5174         DEFINE_WAIT(wait);
5175         u64 reclaim_bytes = 0;
5176         int ret = 0;
5177
5178         spin_lock(&space_info->lock);
5179         while (ticket->bytes > 0 && ticket->error == 0) {
5180                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5181                 if (ret) {
5182                         ret = -EINTR;
5183                         break;
5184                 }
5185                 spin_unlock(&space_info->lock);
5186
5187                 schedule();
5188
5189                 finish_wait(&ticket->wait, &wait);
5190                 spin_lock(&space_info->lock);
5191         }
5192         if (!ret)
5193                 ret = ticket->error;
5194         if (!list_empty(&ticket->list))
5195                 list_del_init(&ticket->list);
5196         if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
5197                 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
5198         spin_unlock(&space_info->lock);
5199
5200         if (reclaim_bytes)
5201                 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5202         return ret;
5203 }
5204
5205 /**
5206  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5207  * @root - the root we're allocating for
5208  * @space_info - the space info we want to allocate from
5209  * @orig_bytes - the number of bytes we want
5210  * @flush - whether or not we can flush to make our reservation
5211  *
5212  * This will reserve orig_bytes number of bytes from the space info associated
5213  * with the block_rsv.  If there is not enough space it will make an attempt to
5214  * flush out space to make room.  It will do this by flushing delalloc if
5215  * possible or committing the transaction.  If flush is 0 then no attempts to
5216  * regain reservations will be made and this will fail if there is not enough
5217  * space already.
5218  */
5219 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5220                                     struct btrfs_space_info *space_info,
5221                                     u64 orig_bytes,
5222                                     enum btrfs_reserve_flush_enum flush,
5223                                     bool system_chunk)
5224 {
5225         struct reserve_ticket ticket;
5226         u64 used;
5227         u64 reclaim_bytes = 0;
5228         int ret = 0;
5229
5230         ASSERT(orig_bytes);
5231         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5232
5233         spin_lock(&space_info->lock);
5234         ret = -ENOSPC;
5235         used = btrfs_space_info_used(space_info, true);
5236
5237         /*
5238          * If we have enough space then hooray, make our reservation and carry
5239          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5240          * If not things get more complicated.
5241          */
5242         if (used + orig_bytes <= space_info->total_bytes) {
5243                 update_bytes_may_use(space_info, orig_bytes);
5244                 trace_btrfs_space_reservation(fs_info, "space_info",
5245                                               space_info->flags, orig_bytes, 1);
5246                 ret = 0;
5247         } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5248                                   system_chunk)) {
5249                 update_bytes_may_use(space_info, orig_bytes);
5250                 trace_btrfs_space_reservation(fs_info, "space_info",
5251                                               space_info->flags, orig_bytes, 1);
5252                 ret = 0;
5253         }
5254
5255         /*
5256          * If we couldn't make a reservation then setup our reservation ticket
5257          * and kick the async worker if it's not already running.
5258          *
5259          * If we are a priority flusher then we just need to add our ticket to
5260          * the list and we will do our own flushing further down.
5261          */
5262         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5263                 ticket.orig_bytes = orig_bytes;
5264                 ticket.bytes = orig_bytes;
5265                 ticket.error = 0;
5266                 init_waitqueue_head(&ticket.wait);
5267                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5268                         list_add_tail(&ticket.list, &space_info->tickets);
5269                         if (!space_info->flush) {
5270                                 space_info->flush = 1;
5271                                 trace_btrfs_trigger_flush(fs_info,
5272                                                           space_info->flags,
5273                                                           orig_bytes, flush,
5274                                                           "enospc");
5275                                 queue_work(system_unbound_wq,
5276                                            &fs_info->async_reclaim_work);
5277                         }
5278                 } else {
5279                         list_add_tail(&ticket.list,
5280                                       &space_info->priority_tickets);
5281                 }
5282         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5283                 used += orig_bytes;
5284                 /*
5285                  * We will do the space reservation dance during log replay,
5286                  * which means we won't have fs_info->fs_root set, so don't do
5287                  * the async reclaim as we will panic.
5288                  */
5289                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5290                     need_do_async_reclaim(fs_info, space_info,
5291                                           used, system_chunk) &&
5292                     !work_busy(&fs_info->async_reclaim_work)) {
5293                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
5294                                                   orig_bytes, flush, "preempt");
5295                         queue_work(system_unbound_wq,
5296                                    &fs_info->async_reclaim_work);
5297                 }
5298         }
5299         spin_unlock(&space_info->lock);
5300         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5301                 return ret;
5302
5303         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5304                 return wait_reserve_ticket(fs_info, space_info, &ticket);
5305
5306         ret = 0;
5307         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5308         spin_lock(&space_info->lock);
5309         if (ticket.bytes) {
5310                 if (ticket.bytes < orig_bytes)
5311                         reclaim_bytes = orig_bytes - ticket.bytes;
5312                 list_del_init(&ticket.list);
5313                 ret = -ENOSPC;
5314         }
5315         spin_unlock(&space_info->lock);
5316
5317         if (reclaim_bytes)
5318                 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5319         ASSERT(list_empty(&ticket.list));
5320         return ret;
5321 }
5322
5323 /**
5324  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5325  * @root - the root we're allocating for
5326  * @block_rsv - the block_rsv we're allocating for
5327  * @orig_bytes - the number of bytes we want
5328  * @flush - whether or not we can flush to make our reservation
5329  *
5330  * This will reserve orig_bytes number of bytes from the space info associated
5331  * with the block_rsv.  If there is not enough space it will make an attempt to
5332  * flush out space to make room.  It will do this by flushing delalloc if
5333  * possible or committing the transaction.  If flush is 0 then no attempts to
5334  * regain reservations will be made and this will fail if there is not enough
5335  * space already.
5336  */
5337 static int reserve_metadata_bytes(struct btrfs_root *root,
5338                                   struct btrfs_block_rsv *block_rsv,
5339                                   u64 orig_bytes,
5340                                   enum btrfs_reserve_flush_enum flush)
5341 {
5342         struct btrfs_fs_info *fs_info = root->fs_info;
5343         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5344         int ret;
5345         bool system_chunk = (root == fs_info->chunk_root);
5346
5347         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5348                                        orig_bytes, flush, system_chunk);
5349         if (ret == -ENOSPC &&
5350             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5351                 if (block_rsv != global_rsv &&
5352                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5353                         ret = 0;
5354         }
5355         if (ret == -ENOSPC) {
5356                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5357                                               block_rsv->space_info->flags,
5358                                               orig_bytes, 1);
5359
5360                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5361                         dump_space_info(fs_info, block_rsv->space_info,
5362                                         orig_bytes, 0);
5363         }
5364         return ret;
5365 }
5366
5367 static struct btrfs_block_rsv *get_block_rsv(
5368                                         const struct btrfs_trans_handle *trans,
5369                                         const struct btrfs_root *root)
5370 {
5371         struct btrfs_fs_info *fs_info = root->fs_info;
5372         struct btrfs_block_rsv *block_rsv = NULL;
5373
5374         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5375             (root == fs_info->csum_root && trans->adding_csums) ||
5376             (root == fs_info->uuid_root))
5377                 block_rsv = trans->block_rsv;
5378
5379         if (!block_rsv)
5380                 block_rsv = root->block_rsv;
5381
5382         if (!block_rsv)
5383                 block_rsv = &fs_info->empty_block_rsv;
5384
5385         return block_rsv;
5386 }
5387
5388 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5389                                u64 num_bytes)
5390 {
5391         int ret = -ENOSPC;
5392         spin_lock(&block_rsv->lock);
5393         if (block_rsv->reserved >= num_bytes) {
5394                 block_rsv->reserved -= num_bytes;
5395                 if (block_rsv->reserved < block_rsv->size)
5396                         block_rsv->full = 0;
5397                 ret = 0;
5398         }
5399         spin_unlock(&block_rsv->lock);
5400         return ret;
5401 }
5402
5403 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5404                                 u64 num_bytes, bool update_size)
5405 {
5406         spin_lock(&block_rsv->lock);
5407         block_rsv->reserved += num_bytes;
5408         if (update_size)
5409                 block_rsv->size += num_bytes;
5410         else if (block_rsv->reserved >= block_rsv->size)
5411                 block_rsv->full = 1;
5412         spin_unlock(&block_rsv->lock);
5413 }
5414
5415 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5416                              struct btrfs_block_rsv *dest, u64 num_bytes,
5417                              int min_factor)
5418 {
5419         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5420         u64 min_bytes;
5421
5422         if (global_rsv->space_info != dest->space_info)
5423                 return -ENOSPC;
5424
5425         spin_lock(&global_rsv->lock);
5426         min_bytes = div_factor(global_rsv->size, min_factor);
5427         if (global_rsv->reserved < min_bytes + num_bytes) {
5428                 spin_unlock(&global_rsv->lock);
5429                 return -ENOSPC;
5430         }
5431         global_rsv->reserved -= num_bytes;
5432         if (global_rsv->reserved < global_rsv->size)
5433                 global_rsv->full = 0;
5434         spin_unlock(&global_rsv->lock);
5435
5436         block_rsv_add_bytes(dest, num_bytes, true);
5437         return 0;
5438 }
5439
5440 /**
5441  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5442  * @fs_info - the fs info for our fs.
5443  * @src - the source block rsv to transfer from.
5444  * @num_bytes - the number of bytes to transfer.
5445  *
5446  * This transfers up to the num_bytes amount from the src rsv to the
5447  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
5448  */
5449 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5450                                        struct btrfs_block_rsv *src,
5451                                        u64 num_bytes)
5452 {
5453         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5454         u64 to_free = 0;
5455
5456         spin_lock(&src->lock);
5457         src->reserved -= num_bytes;
5458         src->size -= num_bytes;
5459         spin_unlock(&src->lock);
5460
5461         spin_lock(&delayed_refs_rsv->lock);
5462         if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5463                 u64 delta = delayed_refs_rsv->size -
5464                         delayed_refs_rsv->reserved;
5465                 if (num_bytes > delta) {
5466                         to_free = num_bytes - delta;
5467                         num_bytes = delta;
5468                 }
5469         } else {
5470                 to_free = num_bytes;
5471                 num_bytes = 0;
5472         }
5473
5474         if (num_bytes)
5475                 delayed_refs_rsv->reserved += num_bytes;
5476         if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5477                 delayed_refs_rsv->full = 1;
5478         spin_unlock(&delayed_refs_rsv->lock);
5479
5480         if (num_bytes)
5481                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5482                                               0, num_bytes, 1);
5483         if (to_free)
5484                 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
5485                                          to_free);
5486 }
5487
5488 /**
5489  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5490  * @fs_info - the fs_info for our fs.
5491  * @flush - control how we can flush for this reservation.
5492  *
5493  * This will refill the delayed block_rsv up to 1 items size worth of space and
5494  * will return -ENOSPC if we can't make the reservation.
5495  */
5496 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5497                                   enum btrfs_reserve_flush_enum flush)
5498 {
5499         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5500         u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5501         u64 num_bytes = 0;
5502         int ret = -ENOSPC;
5503
5504         spin_lock(&block_rsv->lock);
5505         if (block_rsv->reserved < block_rsv->size) {
5506                 num_bytes = block_rsv->size - block_rsv->reserved;
5507                 num_bytes = min(num_bytes, limit);
5508         }
5509         spin_unlock(&block_rsv->lock);
5510
5511         if (!num_bytes)
5512                 return 0;
5513
5514         ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5515                                      num_bytes, flush);
5516         if (ret)
5517                 return ret;
5518         block_rsv_add_bytes(block_rsv, num_bytes, 0);
5519         trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5520                                       0, num_bytes, 1);
5521         return 0;
5522 }
5523
5524 /*
5525  * This is for space we already have accounted in space_info->bytes_may_use, so
5526  * basically when we're returning space from block_rsv's.
5527  */
5528 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5529                                      struct btrfs_space_info *space_info,
5530                                      u64 num_bytes)
5531 {
5532         struct reserve_ticket *ticket;
5533         struct list_head *head;
5534         u64 used;
5535         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5536         bool check_overcommit = false;
5537
5538         spin_lock(&space_info->lock);
5539         head = &space_info->priority_tickets;
5540
5541         /*
5542          * If we are over our limit then we need to check and see if we can
5543          * overcommit, and if we can't then we just need to free up our space
5544          * and not satisfy any requests.
5545          */
5546         used = btrfs_space_info_used(space_info, true);
5547         if (used - num_bytes >= space_info->total_bytes)
5548                 check_overcommit = true;
5549 again:
5550         while (!list_empty(head) && num_bytes) {
5551                 ticket = list_first_entry(head, struct reserve_ticket,
5552                                           list);
5553                 /*
5554                  * We use 0 bytes because this space is already reserved, so
5555                  * adding the ticket space would be a double count.
5556                  */
5557                 if (check_overcommit &&
5558                     !can_overcommit(fs_info, space_info, 0, flush, false))
5559                         break;
5560                 if (num_bytes >= ticket->bytes) {
5561                         list_del_init(&ticket->list);
5562                         num_bytes -= ticket->bytes;
5563                         ticket->bytes = 0;
5564                         space_info->tickets_id++;
5565                         wake_up(&ticket->wait);
5566                 } else {
5567                         ticket->bytes -= num_bytes;
5568                         num_bytes = 0;
5569                 }
5570         }
5571
5572         if (num_bytes && head == &space_info->priority_tickets) {
5573                 head = &space_info->tickets;
5574                 flush = BTRFS_RESERVE_FLUSH_ALL;
5575                 goto again;
5576         }
5577         update_bytes_may_use(space_info, -num_bytes);
5578         trace_btrfs_space_reservation(fs_info, "space_info",
5579                                       space_info->flags, num_bytes, 0);
5580         spin_unlock(&space_info->lock);
5581 }
5582
5583 /*
5584  * This is for newly allocated space that isn't accounted in
5585  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5586  * we use this helper.
5587  */
5588 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5589                                      struct btrfs_space_info *space_info,
5590                                      u64 num_bytes)
5591 {
5592         struct reserve_ticket *ticket;
5593         struct list_head *head = &space_info->priority_tickets;
5594
5595 again:
5596         while (!list_empty(head) && num_bytes) {
5597                 ticket = list_first_entry(head, struct reserve_ticket,
5598                                           list);
5599                 if (num_bytes >= ticket->bytes) {
5600                         trace_btrfs_space_reservation(fs_info, "space_info",
5601                                                       space_info->flags,
5602                                                       ticket->bytes, 1);
5603                         list_del_init(&ticket->list);
5604                         num_bytes -= ticket->bytes;
5605                         update_bytes_may_use(space_info, ticket->bytes);
5606                         ticket->bytes = 0;
5607                         space_info->tickets_id++;
5608                         wake_up(&ticket->wait);
5609                 } else {
5610                         trace_btrfs_space_reservation(fs_info, "space_info",
5611                                                       space_info->flags,
5612                                                       num_bytes, 1);
5613                         update_bytes_may_use(space_info, num_bytes);
5614                         ticket->bytes -= num_bytes;
5615                         num_bytes = 0;
5616                 }
5617         }
5618
5619         if (num_bytes && head == &space_info->priority_tickets) {
5620                 head = &space_info->tickets;
5621                 goto again;
5622         }
5623 }
5624
5625 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5626                                     struct btrfs_block_rsv *block_rsv,
5627                                     struct btrfs_block_rsv *dest, u64 num_bytes,
5628                                     u64 *qgroup_to_release_ret)
5629 {
5630         struct btrfs_space_info *space_info = block_rsv->space_info;
5631         u64 qgroup_to_release = 0;
5632         u64 ret;
5633
5634         spin_lock(&block_rsv->lock);
5635         if (num_bytes == (u64)-1) {
5636                 num_bytes = block_rsv->size;
5637                 qgroup_to_release = block_rsv->qgroup_rsv_size;
5638         }
5639         block_rsv->size -= num_bytes;
5640         if (block_rsv->reserved >= block_rsv->size) {
5641                 num_bytes = block_rsv->reserved - block_rsv->size;
5642                 block_rsv->reserved = block_rsv->size;
5643                 block_rsv->full = 1;
5644         } else {
5645                 num_bytes = 0;
5646         }
5647         if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5648                 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5649                                     block_rsv->qgroup_rsv_size;
5650                 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5651         } else {
5652                 qgroup_to_release = 0;
5653         }
5654         spin_unlock(&block_rsv->lock);
5655
5656         ret = num_bytes;
5657         if (num_bytes > 0) {
5658                 if (dest) {
5659                         spin_lock(&dest->lock);
5660                         if (!dest->full) {
5661                                 u64 bytes_to_add;
5662
5663                                 bytes_to_add = dest->size - dest->reserved;
5664                                 bytes_to_add = min(num_bytes, bytes_to_add);
5665                                 dest->reserved += bytes_to_add;
5666                                 if (dest->reserved >= dest->size)
5667                                         dest->full = 1;
5668                                 num_bytes -= bytes_to_add;
5669                         }
5670                         spin_unlock(&dest->lock);
5671                 }
5672                 if (num_bytes)
5673                         space_info_add_old_bytes(fs_info, space_info,
5674                                                  num_bytes);
5675         }
5676         if (qgroup_to_release_ret)
5677                 *qgroup_to_release_ret = qgroup_to_release;
5678         return ret;
5679 }
5680
5681 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5682                             struct btrfs_block_rsv *dst, u64 num_bytes,
5683                             bool update_size)
5684 {
5685         int ret;
5686
5687         ret = block_rsv_use_bytes(src, num_bytes);
5688         if (ret)
5689                 return ret;
5690
5691         block_rsv_add_bytes(dst, num_bytes, update_size);
5692         return 0;
5693 }
5694
5695 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5696 {
5697         memset(rsv, 0, sizeof(*rsv));
5698         spin_lock_init(&rsv->lock);
5699         rsv->type = type;
5700 }
5701
5702 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5703                                    struct btrfs_block_rsv *rsv,
5704                                    unsigned short type)
5705 {
5706         btrfs_init_block_rsv(rsv, type);
5707         rsv->space_info = __find_space_info(fs_info,
5708                                             BTRFS_BLOCK_GROUP_METADATA);
5709 }
5710
5711 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5712                                               unsigned short type)
5713 {
5714         struct btrfs_block_rsv *block_rsv;
5715
5716         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5717         if (!block_rsv)
5718                 return NULL;
5719
5720         btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5721         return block_rsv;
5722 }
5723
5724 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5725                           struct btrfs_block_rsv *rsv)
5726 {
5727         if (!rsv)
5728                 return;
5729         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5730         kfree(rsv);
5731 }
5732
5733 int btrfs_block_rsv_add(struct btrfs_root *root,
5734                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5735                         enum btrfs_reserve_flush_enum flush)
5736 {
5737         int ret;
5738
5739         if (num_bytes == 0)
5740                 return 0;
5741
5742         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5743         if (!ret)
5744                 block_rsv_add_bytes(block_rsv, num_bytes, true);
5745
5746         return ret;
5747 }
5748
5749 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5750 {
5751         u64 num_bytes = 0;
5752         int ret = -ENOSPC;
5753
5754         if (!block_rsv)
5755                 return 0;
5756
5757         spin_lock(&block_rsv->lock);
5758         num_bytes = div_factor(block_rsv->size, min_factor);
5759         if (block_rsv->reserved >= num_bytes)
5760                 ret = 0;
5761         spin_unlock(&block_rsv->lock);
5762
5763         return ret;
5764 }
5765
5766 int btrfs_block_rsv_refill(struct btrfs_root *root,
5767                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5768                            enum btrfs_reserve_flush_enum flush)
5769 {
5770         u64 num_bytes = 0;
5771         int ret = -ENOSPC;
5772
5773         if (!block_rsv)
5774                 return 0;
5775
5776         spin_lock(&block_rsv->lock);
5777         num_bytes = min_reserved;
5778         if (block_rsv->reserved >= num_bytes)
5779                 ret = 0;
5780         else
5781                 num_bytes -= block_rsv->reserved;
5782         spin_unlock(&block_rsv->lock);
5783
5784         if (!ret)
5785                 return 0;
5786
5787         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5788         if (!ret) {
5789                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5790                 return 0;
5791         }
5792
5793         return ret;
5794 }
5795
5796 static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
5797                                 u64 *metadata_bytes, u64 *qgroup_bytes)
5798 {
5799         *metadata_bytes = 0;
5800         *qgroup_bytes = 0;
5801
5802         spin_lock(&block_rsv->lock);
5803         if (block_rsv->reserved < block_rsv->size)
5804                 *metadata_bytes = block_rsv->size - block_rsv->reserved;
5805         if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5806                 *qgroup_bytes = block_rsv->qgroup_rsv_size -
5807                         block_rsv->qgroup_rsv_reserved;
5808         spin_unlock(&block_rsv->lock);
5809 }
5810
5811 /**
5812  * btrfs_inode_rsv_refill - refill the inode block rsv.
5813  * @inode - the inode we are refilling.
5814  * @flush - the flushing restriction.
5815  *
5816  * Essentially the same as btrfs_block_rsv_refill, except it uses the
5817  * block_rsv->size as the minimum size.  We'll either refill the missing amount
5818  * or return if we already have enough space.  This will also handle the reserve
5819  * tracepoint for the reserved amount.
5820  */
5821 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5822                                   enum btrfs_reserve_flush_enum flush)
5823 {
5824         struct btrfs_root *root = inode->root;
5825         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5826         u64 num_bytes, last = 0;
5827         u64 qgroup_num_bytes;
5828         int ret = -ENOSPC;
5829
5830         calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
5831         if (num_bytes == 0)
5832                 return 0;
5833
5834         do {
5835                 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
5836                                                          true);
5837                 if (ret)
5838                         return ret;
5839                 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5840                 if (ret) {
5841                         btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5842                         last = num_bytes;
5843                         /*
5844                          * If we are fragmented we can end up with a lot of
5845                          * outstanding extents which will make our size be much
5846                          * larger than our reserved amount.
5847                          *
5848                          * If the reservation happens here, it might be very
5849                          * big though not needed in the end, if the delalloc
5850                          * flushing happens.
5851                          *
5852                          * If this is the case try and do the reserve again.
5853                          */
5854                         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5855                                 calc_refill_bytes(block_rsv, &num_bytes,
5856                                                    &qgroup_num_bytes);
5857                         if (num_bytes == 0)
5858                                 return 0;
5859                 }
5860         } while (ret && last != num_bytes);
5861
5862         if (!ret) {
5863                 block_rsv_add_bytes(block_rsv, num_bytes, false);
5864                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5865                                               btrfs_ino(inode), num_bytes, 1);
5866
5867                 /* Don't forget to increase qgroup_rsv_reserved */
5868                 spin_lock(&block_rsv->lock);
5869                 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5870                 spin_unlock(&block_rsv->lock);
5871         }
5872         return ret;
5873 }
5874
5875 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5876                                      struct btrfs_block_rsv *block_rsv,
5877                                      u64 num_bytes, u64 *qgroup_to_release)
5878 {
5879         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5880         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5881         struct btrfs_block_rsv *target = delayed_rsv;
5882
5883         if (target->full || target == block_rsv)
5884                 target = global_rsv;
5885
5886         if (block_rsv->space_info != target->space_info)
5887                 target = NULL;
5888
5889         return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5890                                        qgroup_to_release);
5891 }
5892
5893 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5894                              struct btrfs_block_rsv *block_rsv,
5895                              u64 num_bytes)
5896 {
5897         __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5898 }
5899
5900 /**
5901  * btrfs_inode_rsv_release - release any excessive reservation.
5902  * @inode - the inode we need to release from.
5903  * @qgroup_free - free or convert qgroup meta.
5904  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5905  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5906  *   @qgroup_free is true for error handling, and false for normal release.
5907  *
5908  * This is the same as btrfs_block_rsv_release, except that it handles the
5909  * tracepoint for the reservation.
5910  */
5911 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5912 {
5913         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5914         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5915         u64 released = 0;
5916         u64 qgroup_to_release = 0;
5917
5918         /*
5919          * Since we statically set the block_rsv->size we just want to say we
5920          * are releasing 0 bytes, and then we'll just get the reservation over
5921          * the size free'd.
5922          */
5923         released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5924                                              &qgroup_to_release);
5925         if (released > 0)
5926                 trace_btrfs_space_reservation(fs_info, "delalloc",
5927                                               btrfs_ino(inode), released, 0);
5928         if (qgroup_free)
5929                 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5930         else
5931                 btrfs_qgroup_convert_reserved_meta(inode->root,
5932                                                    qgroup_to_release);
5933 }
5934
5935 /**
5936  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5937  * @fs_info - the fs_info for our fs.
5938  * @nr - the number of items to drop.
5939  *
5940  * This drops the delayed ref head's count from the delayed refs rsv and frees
5941  * any excess reservation we had.
5942  */
5943 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5944 {
5945         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5946         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5947         u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5948         u64 released = 0;
5949
5950         released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5951                                            num_bytes, NULL);
5952         if (released)
5953                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5954                                               0, released, 0);
5955 }
5956
5957 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5958 {
5959         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5960         struct btrfs_space_info *sinfo = block_rsv->space_info;
5961         u64 num_bytes;
5962
5963         /*
5964          * The global block rsv is based on the size of the extent tree, the
5965          * checksum tree and the root tree.  If the fs is empty we want to set
5966          * it to a minimal amount for safety.
5967          */
5968         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5969                 btrfs_root_used(&fs_info->csum_root->root_item) +
5970                 btrfs_root_used(&fs_info->tree_root->root_item);
5971         num_bytes = max_t(u64, num_bytes, SZ_16M);
5972
5973         spin_lock(&sinfo->lock);
5974         spin_lock(&block_rsv->lock);
5975
5976         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5977
5978         if (block_rsv->reserved < block_rsv->size) {
5979                 num_bytes = btrfs_space_info_used(sinfo, true);
5980                 if (sinfo->total_bytes > num_bytes) {
5981                         num_bytes = sinfo->total_bytes - num_bytes;
5982                         num_bytes = min(num_bytes,
5983                                         block_rsv->size - block_rsv->reserved);
5984                         block_rsv->reserved += num_bytes;
5985                         update_bytes_may_use(sinfo, num_bytes);
5986                         trace_btrfs_space_reservation(fs_info, "space_info",
5987                                                       sinfo->flags, num_bytes,
5988                                                       1);
5989                 }
5990         } else if (block_rsv->reserved > block_rsv->size) {
5991                 num_bytes = block_rsv->reserved - block_rsv->size;
5992                 update_bytes_may_use(sinfo, -num_bytes);
5993                 trace_btrfs_space_reservation(fs_info, "space_info",
5994                                       sinfo->flags, num_bytes, 0);
5995                 block_rsv->reserved = block_rsv->size;
5996         }
5997
5998         if (block_rsv->reserved == block_rsv->size)
5999                 block_rsv->full = 1;
6000         else
6001                 block_rsv->full = 0;
6002
6003         spin_unlock(&block_rsv->lock);
6004         spin_unlock(&sinfo->lock);
6005 }
6006
6007 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
6008 {
6009         struct btrfs_space_info *space_info;
6010
6011         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
6012         fs_info->chunk_block_rsv.space_info = space_info;
6013
6014         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
6015         fs_info->global_block_rsv.space_info = space_info;
6016         fs_info->trans_block_rsv.space_info = space_info;
6017         fs_info->empty_block_rsv.space_info = space_info;
6018         fs_info->delayed_block_rsv.space_info = space_info;
6019         fs_info->delayed_refs_rsv.space_info = space_info;
6020
6021         fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
6022         fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
6023         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
6024         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
6025         if (fs_info->quota_root)
6026                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
6027         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
6028
6029         update_global_block_rsv(fs_info);
6030 }
6031
6032 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
6033 {
6034         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
6035                                 (u64)-1, NULL);
6036         WARN_ON(fs_info->trans_block_rsv.size > 0);
6037         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
6038         WARN_ON(fs_info->chunk_block_rsv.size > 0);
6039         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
6040         WARN_ON(fs_info->delayed_block_rsv.size > 0);
6041         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
6042         WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
6043         WARN_ON(fs_info->delayed_refs_rsv.size > 0);
6044 }
6045
6046 /*
6047  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
6048  * @trans - the trans that may have generated delayed refs
6049  *
6050  * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
6051  * it'll calculate the additional size and add it to the delayed_refs_rsv.
6052  */
6053 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
6054 {
6055         struct btrfs_fs_info *fs_info = trans->fs_info;
6056         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
6057         u64 num_bytes;
6058
6059         if (!trans->delayed_ref_updates)
6060                 return;
6061
6062         num_bytes = btrfs_calc_trans_metadata_size(fs_info,
6063                                                    trans->delayed_ref_updates);
6064         spin_lock(&delayed_rsv->lock);
6065         delayed_rsv->size += num_bytes;
6066         delayed_rsv->full = 0;
6067         spin_unlock(&delayed_rsv->lock);
6068         trans->delayed_ref_updates = 0;
6069 }
6070
6071 /*
6072  * To be called after all the new block groups attached to the transaction
6073  * handle have been created (btrfs_create_pending_block_groups()).
6074  */
6075 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
6076 {
6077         struct btrfs_fs_info *fs_info = trans->fs_info;
6078
6079         if (!trans->chunk_bytes_reserved)
6080                 return;
6081
6082         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
6083
6084         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
6085                                 trans->chunk_bytes_reserved, NULL);
6086         trans->chunk_bytes_reserved = 0;
6087 }
6088
6089 /*
6090  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
6091  * root: the root of the parent directory
6092  * rsv: block reservation
6093  * items: the number of items that we need do reservation
6094  * use_global_rsv: allow fallback to the global block reservation
6095  *
6096  * This function is used to reserve the space for snapshot/subvolume
6097  * creation and deletion. Those operations are different with the
6098  * common file/directory operations, they change two fs/file trees
6099  * and root tree, the number of items that the qgroup reserves is
6100  * different with the free space reservation. So we can not use
6101  * the space reservation mechanism in start_transaction().
6102  */
6103 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
6104                                      struct btrfs_block_rsv *rsv, int items,
6105                                      bool use_global_rsv)
6106 {
6107         u64 qgroup_num_bytes = 0;
6108         u64 num_bytes;
6109         int ret;
6110         struct btrfs_fs_info *fs_info = root->fs_info;
6111         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6112
6113         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6114                 /* One for parent inode, two for dir entries */
6115                 qgroup_num_bytes = 3 * fs_info->nodesize;
6116                 ret = btrfs_qgroup_reserve_meta_prealloc(root,
6117                                 qgroup_num_bytes, true);
6118                 if (ret)
6119                         return ret;
6120         }
6121
6122         num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
6123         rsv->space_info = __find_space_info(fs_info,
6124                                             BTRFS_BLOCK_GROUP_METADATA);
6125         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
6126                                   BTRFS_RESERVE_FLUSH_ALL);
6127
6128         if (ret == -ENOSPC && use_global_rsv)
6129                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
6130
6131         if (ret && qgroup_num_bytes)
6132                 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
6133
6134         return ret;
6135 }
6136
6137 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
6138                                       struct btrfs_block_rsv *rsv)
6139 {
6140         btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
6141 }
6142
6143 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
6144                                                  struct btrfs_inode *inode)
6145 {
6146         struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6147         u64 reserve_size = 0;
6148         u64 qgroup_rsv_size = 0;
6149         u64 csum_leaves;
6150         unsigned outstanding_extents;
6151
6152         lockdep_assert_held(&inode->lock);
6153         outstanding_extents = inode->outstanding_extents;
6154         if (outstanding_extents)
6155                 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6156                                                 outstanding_extents + 1);
6157         csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6158                                                  inode->csum_bytes);
6159         reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6160                                                        csum_leaves);
6161         /*
6162          * For qgroup rsv, the calculation is very simple:
6163          * account one nodesize for each outstanding extent
6164          *
6165          * This is overestimating in most cases.
6166          */
6167         qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
6168
6169         spin_lock(&block_rsv->lock);
6170         block_rsv->size = reserve_size;
6171         block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6172         spin_unlock(&block_rsv->lock);
6173 }
6174
6175 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6176 {
6177         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6178         unsigned nr_extents;
6179         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6180         int ret = 0;
6181         bool delalloc_lock = true;
6182
6183         /* If we are a free space inode we need to not flush since we will be in
6184          * the middle of a transaction commit.  We also don't need the delalloc
6185          * mutex since we won't race with anybody.  We need this mostly to make
6186          * lockdep shut its filthy mouth.
6187          *
6188          * If we have a transaction open (can happen if we call truncate_block
6189          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6190          */
6191         if (btrfs_is_free_space_inode(inode)) {
6192                 flush = BTRFS_RESERVE_NO_FLUSH;
6193                 delalloc_lock = false;
6194         } else {
6195                 if (current->journal_info)
6196                         flush = BTRFS_RESERVE_FLUSH_LIMIT;
6197
6198                 if (btrfs_transaction_in_commit(fs_info))
6199                         schedule_timeout(1);
6200         }
6201
6202         if (delalloc_lock)
6203                 mutex_lock(&inode->delalloc_mutex);
6204
6205         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6206
6207         /* Add our new extents and calculate the new rsv size. */
6208         spin_lock(&inode->lock);
6209         nr_extents = count_max_extents(num_bytes);
6210         btrfs_mod_outstanding_extents(inode, nr_extents);
6211         inode->csum_bytes += num_bytes;
6212         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6213         spin_unlock(&inode->lock);
6214
6215         ret = btrfs_inode_rsv_refill(inode, flush);
6216         if (unlikely(ret))
6217                 goto out_fail;
6218
6219         if (delalloc_lock)
6220                 mutex_unlock(&inode->delalloc_mutex);
6221         return 0;
6222
6223 out_fail:
6224         spin_lock(&inode->lock);
6225         nr_extents = count_max_extents(num_bytes);
6226         btrfs_mod_outstanding_extents(inode, -nr_extents);
6227         inode->csum_bytes -= num_bytes;
6228         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6229         spin_unlock(&inode->lock);
6230
6231         btrfs_inode_rsv_release(inode, true);
6232         if (delalloc_lock)
6233                 mutex_unlock(&inode->delalloc_mutex);
6234         return ret;
6235 }
6236
6237 /**
6238  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6239  * @inode: the inode to release the reservation for.
6240  * @num_bytes: the number of bytes we are releasing.
6241  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6242  *
6243  * This will release the metadata reservation for an inode.  This can be called
6244  * once we complete IO for a given set of bytes to release their metadata
6245  * reservations, or on error for the same reason.
6246  */
6247 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6248                                      bool qgroup_free)
6249 {
6250         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6251
6252         num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6253         spin_lock(&inode->lock);
6254         inode->csum_bytes -= num_bytes;
6255         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6256         spin_unlock(&inode->lock);
6257
6258         if (btrfs_is_testing(fs_info))
6259                 return;
6260
6261         btrfs_inode_rsv_release(inode, qgroup_free);
6262 }
6263
6264 /**
6265  * btrfs_delalloc_release_extents - release our outstanding_extents
6266  * @inode: the inode to balance the reservation for.
6267  * @num_bytes: the number of bytes we originally reserved with
6268  * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6269  *
6270  * When we reserve space we increase outstanding_extents for the extents we may
6271  * add.  Once we've set the range as delalloc or created our ordered extents we
6272  * have outstanding_extents to track the real usage, so we use this to free our
6273  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6274  * with btrfs_delalloc_reserve_metadata.
6275  */
6276 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6277                                     bool qgroup_free)
6278 {
6279         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6280         unsigned num_extents;
6281
6282         spin_lock(&inode->lock);
6283         num_extents = count_max_extents(num_bytes);
6284         btrfs_mod_outstanding_extents(inode, -num_extents);
6285         btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6286         spin_unlock(&inode->lock);
6287
6288         if (btrfs_is_testing(fs_info))
6289                 return;
6290
6291         btrfs_inode_rsv_release(inode, qgroup_free);
6292 }
6293
6294 /**
6295  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6296  * delalloc
6297  * @inode: inode we're writing to
6298  * @start: start range we are writing to
6299  * @len: how long the range we are writing to
6300  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6301  *            current reservation.
6302  *
6303  * This will do the following things
6304  *
6305  * o reserve space in data space info for num bytes
6306  *   and reserve precious corresponding qgroup space
6307  *   (Done in check_data_free_space)
6308  *
6309  * o reserve space for metadata space, based on the number of outstanding
6310  *   extents and how much csums will be needed
6311  *   also reserve metadata space in a per root over-reserve method.
6312  * o add to the inodes->delalloc_bytes
6313  * o add it to the fs_info's delalloc inodes list.
6314  *   (Above 3 all done in delalloc_reserve_metadata)
6315  *
6316  * Return 0 for success
6317  * Return <0 for error(-ENOSPC or -EQUOT)
6318  */
6319 int btrfs_delalloc_reserve_space(struct inode *inode,
6320                         struct extent_changeset **reserved, u64 start, u64 len)
6321 {
6322         int ret;
6323
6324         ret = btrfs_check_data_free_space(inode, reserved, start, len);
6325         if (ret < 0)
6326                 return ret;
6327         ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6328         if (ret < 0)
6329                 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6330         return ret;
6331 }
6332
6333 /**
6334  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6335  * @inode: inode we're releasing space for
6336  * @start: start position of the space already reserved
6337  * @len: the len of the space already reserved
6338  * @release_bytes: the len of the space we consumed or didn't use
6339  *
6340  * This function will release the metadata space that was not used and will
6341  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6342  * list if there are no delalloc bytes left.
6343  * Also it will handle the qgroup reserved space.
6344  */
6345 void btrfs_delalloc_release_space(struct inode *inode,
6346                                   struct extent_changeset *reserved,
6347                                   u64 start, u64 len, bool qgroup_free)
6348 {
6349         btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6350         btrfs_free_reserved_data_space(inode, reserved, start, len);
6351 }
6352
6353 static int update_block_group(struct btrfs_trans_handle *trans,
6354                               struct btrfs_fs_info *info, u64 bytenr,
6355                               u64 num_bytes, int alloc)
6356 {
6357         struct btrfs_block_group_cache *cache = NULL;
6358         u64 total = num_bytes;
6359         u64 old_val;
6360         u64 byte_in_group;
6361         int factor;
6362         int ret = 0;
6363
6364         /* block accounting for super block */
6365         spin_lock(&info->delalloc_root_lock);
6366         old_val = btrfs_super_bytes_used(info->super_copy);
6367         if (alloc)
6368                 old_val += num_bytes;
6369         else
6370                 old_val -= num_bytes;
6371         btrfs_set_super_bytes_used(info->super_copy, old_val);
6372         spin_unlock(&info->delalloc_root_lock);
6373
6374         while (total) {
6375                 cache = btrfs_lookup_block_group(info, bytenr);
6376                 if (!cache) {
6377                         ret = -ENOENT;
6378                         break;
6379                 }
6380                 factor = btrfs_bg_type_to_factor(cache->flags);
6381
6382                 /*
6383                  * If this block group has free space cache written out, we
6384                  * need to make sure to load it if we are removing space.  This
6385                  * is because we need the unpinning stage to actually add the
6386                  * space back to the block group, otherwise we will leak space.
6387                  */
6388                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6389                         cache_block_group(cache, 1);
6390
6391                 byte_in_group = bytenr - cache->key.objectid;
6392                 WARN_ON(byte_in_group > cache->key.offset);
6393
6394                 spin_lock(&cache->space_info->lock);
6395                 spin_lock(&cache->lock);
6396
6397                 if (btrfs_test_opt(info, SPACE_CACHE) &&
6398                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6399                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6400
6401                 old_val = btrfs_block_group_used(&cache->item);
6402                 num_bytes = min(total, cache->key.offset - byte_in_group);
6403                 if (alloc) {
6404                         old_val += num_bytes;
6405                         btrfs_set_block_group_used(&cache->item, old_val);
6406                         cache->reserved -= num_bytes;
6407                         cache->space_info->bytes_reserved -= num_bytes;
6408                         cache->space_info->bytes_used += num_bytes;
6409                         cache->space_info->disk_used += num_bytes * factor;
6410                         spin_unlock(&cache->lock);
6411                         spin_unlock(&cache->space_info->lock);
6412                 } else {
6413                         old_val -= num_bytes;
6414                         btrfs_set_block_group_used(&cache->item, old_val);
6415                         cache->pinned += num_bytes;
6416                         update_bytes_pinned(cache->space_info, num_bytes);
6417                         cache->space_info->bytes_used -= num_bytes;
6418                         cache->space_info->disk_used -= num_bytes * factor;
6419                         spin_unlock(&cache->lock);
6420                         spin_unlock(&cache->space_info->lock);
6421
6422                         trace_btrfs_space_reservation(info, "pinned",
6423                                                       cache->space_info->flags,
6424                                                       num_bytes, 1);
6425                         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6426                                            num_bytes,
6427                                            BTRFS_TOTAL_BYTES_PINNED_BATCH);
6428                         set_extent_dirty(info->pinned_extents,
6429                                          bytenr, bytenr + num_bytes - 1,
6430                                          GFP_NOFS | __GFP_NOFAIL);
6431                 }
6432
6433                 spin_lock(&trans->transaction->dirty_bgs_lock);
6434                 if (list_empty(&cache->dirty_list)) {
6435                         list_add_tail(&cache->dirty_list,
6436                                       &trans->transaction->dirty_bgs);
6437                         trans->transaction->num_dirty_bgs++;
6438                         trans->delayed_ref_updates++;
6439                         btrfs_get_block_group(cache);
6440                 }
6441                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6442
6443                 /*
6444                  * No longer have used bytes in this block group, queue it for
6445                  * deletion. We do this after adding the block group to the
6446                  * dirty list to avoid races between cleaner kthread and space
6447                  * cache writeout.
6448                  */
6449                 if (!alloc && old_val == 0)
6450                         btrfs_mark_bg_unused(cache);
6451
6452                 btrfs_put_block_group(cache);
6453                 total -= num_bytes;
6454                 bytenr += num_bytes;
6455         }
6456
6457         /* Modified block groups are accounted for in the delayed_refs_rsv. */
6458         btrfs_update_delayed_refs_rsv(trans);
6459         return ret;
6460 }
6461
6462 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6463 {
6464         struct btrfs_block_group_cache *cache;
6465         u64 bytenr;
6466
6467         spin_lock(&fs_info->block_group_cache_lock);
6468         bytenr = fs_info->first_logical_byte;
6469         spin_unlock(&fs_info->block_group_cache_lock);
6470
6471         if (bytenr < (u64)-1)
6472                 return bytenr;
6473
6474         cache = btrfs_lookup_first_block_group(fs_info, search_start);
6475         if (!cache)
6476                 return 0;
6477
6478         bytenr = cache->key.objectid;
6479         btrfs_put_block_group(cache);
6480
6481         return bytenr;
6482 }
6483
6484 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6485                            struct btrfs_block_group_cache *cache,
6486                            u64 bytenr, u64 num_bytes, int reserved)
6487 {
6488         spin_lock(&cache->space_info->lock);
6489         spin_lock(&cache->lock);
6490         cache->pinned += num_bytes;
6491         update_bytes_pinned(cache->space_info, num_bytes);
6492         if (reserved) {
6493                 cache->reserved -= num_bytes;
6494                 cache->space_info->bytes_reserved -= num_bytes;
6495         }
6496         spin_unlock(&cache->lock);
6497         spin_unlock(&cache->space_info->lock);
6498
6499         trace_btrfs_space_reservation(fs_info, "pinned",
6500                                       cache->space_info->flags, num_bytes, 1);
6501         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6502                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6503         set_extent_dirty(fs_info->pinned_extents, bytenr,
6504                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6505         return 0;
6506 }
6507
6508 /*
6509  * this function must be called within transaction
6510  */
6511 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6512                      u64 bytenr, u64 num_bytes, int reserved)
6513 {
6514         struct btrfs_block_group_cache *cache;
6515
6516         cache = btrfs_lookup_block_group(fs_info, bytenr);
6517         BUG_ON(!cache); /* Logic error */
6518
6519         pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6520
6521         btrfs_put_block_group(cache);
6522         return 0;
6523 }
6524
6525 /*
6526  * this function must be called within transaction
6527  */
6528 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6529                                     u64 bytenr, u64 num_bytes)
6530 {
6531         struct btrfs_block_group_cache *cache;
6532         int ret;
6533
6534         cache = btrfs_lookup_block_group(fs_info, bytenr);
6535         if (!cache)
6536                 return -EINVAL;
6537
6538         /*
6539          * pull in the free space cache (if any) so that our pin
6540          * removes the free space from the cache.  We have load_only set
6541          * to one because the slow code to read in the free extents does check
6542          * the pinned extents.
6543          */
6544         cache_block_group(cache, 1);
6545
6546         pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6547
6548         /* remove us from the free space cache (if we're there at all) */
6549         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6550         btrfs_put_block_group(cache);
6551         return ret;
6552 }
6553
6554 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6555                                    u64 start, u64 num_bytes)
6556 {
6557         int ret;
6558         struct btrfs_block_group_cache *block_group;
6559         struct btrfs_caching_control *caching_ctl;
6560
6561         block_group = btrfs_lookup_block_group(fs_info, start);
6562         if (!block_group)
6563                 return -EINVAL;
6564
6565         cache_block_group(block_group, 0);
6566         caching_ctl = get_caching_control(block_group);
6567
6568         if (!caching_ctl) {
6569                 /* Logic error */
6570                 BUG_ON(!block_group_cache_done(block_group));
6571                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6572         } else {
6573                 mutex_lock(&caching_ctl->mutex);
6574
6575                 if (start >= caching_ctl->progress) {
6576                         ret = add_excluded_extent(fs_info, start, num_bytes);
6577                 } else if (start + num_bytes <= caching_ctl->progress) {
6578                         ret = btrfs_remove_free_space(block_group,
6579                                                       start, num_bytes);
6580                 } else {
6581                         num_bytes = caching_ctl->progress - start;
6582                         ret = btrfs_remove_free_space(block_group,
6583                                                       start, num_bytes);
6584                         if (ret)
6585                                 goto out_lock;
6586
6587                         num_bytes = (start + num_bytes) -
6588                                 caching_ctl->progress;
6589                         start = caching_ctl->progress;
6590                         ret = add_excluded_extent(fs_info, start, num_bytes);
6591                 }
6592 out_lock:
6593                 mutex_unlock(&caching_ctl->mutex);
6594                 put_caching_control(caching_ctl);
6595         }
6596         btrfs_put_block_group(block_group);
6597         return ret;
6598 }
6599
6600 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
6601 {
6602         struct btrfs_fs_info *fs_info = eb->fs_info;
6603         struct btrfs_file_extent_item *item;
6604         struct btrfs_key key;
6605         int found_type;
6606         int i;
6607         int ret = 0;
6608
6609         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6610                 return 0;
6611
6612         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6613                 btrfs_item_key_to_cpu(eb, &key, i);
6614                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6615                         continue;
6616                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6617                 found_type = btrfs_file_extent_type(eb, item);
6618                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6619                         continue;
6620                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6621                         continue;
6622                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6623                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6624                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6625                 if (ret)
6626                         break;
6627         }
6628
6629         return ret;
6630 }
6631
6632 static void
6633 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6634 {
6635         atomic_inc(&bg->reservations);
6636 }
6637
6638 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6639                                         const u64 start)
6640 {
6641         struct btrfs_block_group_cache *bg;
6642
6643         bg = btrfs_lookup_block_group(fs_info, start);
6644         ASSERT(bg);
6645         if (atomic_dec_and_test(&bg->reservations))
6646                 wake_up_var(&bg->reservations);
6647         btrfs_put_block_group(bg);
6648 }
6649
6650 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6651 {
6652         struct btrfs_space_info *space_info = bg->space_info;
6653
6654         ASSERT(bg->ro);
6655
6656         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6657                 return;
6658
6659         /*
6660          * Our block group is read only but before we set it to read only,
6661          * some task might have had allocated an extent from it already, but it
6662          * has not yet created a respective ordered extent (and added it to a
6663          * root's list of ordered extents).
6664          * Therefore wait for any task currently allocating extents, since the
6665          * block group's reservations counter is incremented while a read lock
6666          * on the groups' semaphore is held and decremented after releasing
6667          * the read access on that semaphore and creating the ordered extent.
6668          */
6669         down_write(&space_info->groups_sem);
6670         up_write(&space_info->groups_sem);
6671
6672         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6673 }
6674
6675 /**
6676  * btrfs_add_reserved_bytes - update the block_group and space info counters
6677  * @cache:      The cache we are manipulating
6678  * @ram_bytes:  The number of bytes of file content, and will be same to
6679  *              @num_bytes except for the compress path.
6680  * @num_bytes:  The number of bytes in question
6681  * @delalloc:   The blocks are allocated for the delalloc write
6682  *
6683  * This is called by the allocator when it reserves space. If this is a
6684  * reservation and the block group has become read only we cannot make the
6685  * reservation and return -EAGAIN, otherwise this function always succeeds.
6686  */
6687 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6688                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6689 {
6690         struct btrfs_space_info *space_info = cache->space_info;
6691         int ret = 0;
6692
6693         spin_lock(&space_info->lock);
6694         spin_lock(&cache->lock);
6695         if (cache->ro) {
6696                 ret = -EAGAIN;
6697         } else {
6698                 cache->reserved += num_bytes;
6699                 space_info->bytes_reserved += num_bytes;
6700                 update_bytes_may_use(space_info, -ram_bytes);
6701                 if (delalloc)
6702                         cache->delalloc_bytes += num_bytes;
6703         }
6704         spin_unlock(&cache->lock);
6705         spin_unlock(&space_info->lock);
6706         return ret;
6707 }
6708
6709 /**
6710  * btrfs_free_reserved_bytes - update the block_group and space info counters
6711  * @cache:      The cache we are manipulating
6712  * @num_bytes:  The number of bytes in question
6713  * @delalloc:   The blocks are allocated for the delalloc write
6714  *
6715  * This is called by somebody who is freeing space that was never actually used
6716  * on disk.  For example if you reserve some space for a new leaf in transaction
6717  * A and before transaction A commits you free that leaf, you call this with
6718  * reserve set to 0 in order to clear the reservation.
6719  */
6720
6721 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6722                                       u64 num_bytes, int delalloc)
6723 {
6724         struct btrfs_space_info *space_info = cache->space_info;
6725
6726         spin_lock(&space_info->lock);
6727         spin_lock(&cache->lock);
6728         if (cache->ro)
6729                 space_info->bytes_readonly += num_bytes;
6730         cache->reserved -= num_bytes;
6731         space_info->bytes_reserved -= num_bytes;
6732         space_info->max_extent_size = 0;
6733
6734         if (delalloc)
6735                 cache->delalloc_bytes -= num_bytes;
6736         spin_unlock(&cache->lock);
6737         spin_unlock(&space_info->lock);
6738 }
6739 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6740 {
6741         struct btrfs_caching_control *next;
6742         struct btrfs_caching_control *caching_ctl;
6743         struct btrfs_block_group_cache *cache;
6744
6745         down_write(&fs_info->commit_root_sem);
6746
6747         list_for_each_entry_safe(caching_ctl, next,
6748                                  &fs_info->caching_block_groups, list) {
6749                 cache = caching_ctl->block_group;
6750                 if (block_group_cache_done(cache)) {
6751                         cache->last_byte_to_unpin = (u64)-1;
6752                         list_del_init(&caching_ctl->list);
6753                         put_caching_control(caching_ctl);
6754                 } else {
6755                         cache->last_byte_to_unpin = caching_ctl->progress;
6756                 }
6757         }
6758
6759         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6760                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6761         else
6762                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6763
6764         up_write(&fs_info->commit_root_sem);
6765
6766         update_global_block_rsv(fs_info);
6767 }
6768
6769 /*
6770  * Returns the free cluster for the given space info and sets empty_cluster to
6771  * what it should be based on the mount options.
6772  */
6773 static struct btrfs_free_cluster *
6774 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6775                    struct btrfs_space_info *space_info, u64 *empty_cluster)
6776 {
6777         struct btrfs_free_cluster *ret = NULL;
6778
6779         *empty_cluster = 0;
6780         if (btrfs_mixed_space_info(space_info))
6781                 return ret;
6782
6783         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6784                 ret = &fs_info->meta_alloc_cluster;
6785                 if (btrfs_test_opt(fs_info, SSD))
6786                         *empty_cluster = SZ_2M;
6787                 else
6788                         *empty_cluster = SZ_64K;
6789         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6790                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
6791                 *empty_cluster = SZ_2M;
6792                 ret = &fs_info->data_alloc_cluster;
6793         }
6794
6795         return ret;
6796 }
6797
/*
 * Drop the pinned accounting for the inclusive byte range [@start, @end].
 *
 * The range is walked one block group at a time; a range that is not covered
 * by any block group trips BUG_ON().  For every piece, its length is
 * subtracted from cache->pinned and from the pinned counters of the owning
 * space_info.  A piece that starts below cache->last_byte_to_unpin is clamped
 * to that boundary and, when @return_free_space is set, handed to
 * btrfs_add_free_space().
 *
 * Bytes unpinned from a read-only block group are added to
 * space_info->bytes_readonly.  Otherwise, if @return_free_space is set and
 * the global block reserve uses the same space_info, the bytes first top up
 * the global reserve and whatever is left is passed to
 * space_info_add_new_bytes().
 *
 * Always returns 0.
 */
6798 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6799                               u64 start, u64 end,
6800                               const bool return_free_space)
6801 {
6802         struct btrfs_block_group_cache *cache = NULL;
6803         struct btrfs_space_info *space_info;
6804         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6805         struct btrfs_free_cluster *cluster = NULL;
6806         u64 len;
6807         u64 total_unpinned = 0;
6808         u64 empty_cluster = 0;
6809         bool readonly;
6810
6811         while (start <= end) {
6812                 readonly = false;
                     /*
                      * First iteration, or @start has moved past the end of
                      * the current block group: switch to the block group
                      * containing @start and restart the per-group total.
                      */
6813                 if (!cache ||
6814                     start >= cache->key.objectid + cache->key.offset) {
6815                         if (cache)
6816                                 btrfs_put_block_group(cache);
6817                         total_unpinned = 0;
6818                         cache = btrfs_lookup_block_group(fs_info, start);
6819                         BUG_ON(!cache); /* Logic error */
6820
6821                         cluster = fetch_cluster_info(fs_info,
6822                                                      cache->space_info,
6823                                                      &empty_cluster);
                             /*
                              * The fragmented check below compares against
                              * twice the empty cluster size.
                              */
6824                         empty_cluster <<= 1;
6825                 }
6826
                     /* Bytes left in this block group, capped at @end. */
6827                 len = cache->key.objectid + cache->key.offset - start;
6828                 len = min(len, end + 1 - start);
6829
6830                 if (start < cache->last_byte_to_unpin) {
6831                         len = min(len, cache->last_byte_to_unpin - start);
6832                         if (return_free_space)
6833                                 btrfs_add_free_space(cache, start, len);
6834                 }
6835
6836                 start += len;
6837                 total_unpinned += len;
6838                 space_info = cache->space_info;
6839
6840                 /*
6841                  * If this space cluster has been marked as fragmented and we've
6842                  * unpinned enough in this block group to potentially allow a
6843                  * cluster to be created inside of it go ahead and clear the
6844                  * fragmented check.
6845                  */
6846                 if (cluster && cluster->fragmented &&
6847                     total_unpinned > empty_cluster) {
6848                         spin_lock(&cluster->lock);
6849                         cluster->fragmented = 0;
6850                         spin_unlock(&cluster->lock);
6851                 }
6852
                     /* Lock order: space_info->lock first, then cache->lock. */
6853                 spin_lock(&space_info->lock);
6854                 spin_lock(&cache->lock);
6855                 cache->pinned -= len;
6856                 update_bytes_pinned(space_info, -len);
6857
6858                 trace_btrfs_space_reservation(fs_info, "pinned",
6859                                               space_info->flags, len, 0);
6860                 space_info->max_extent_size = 0;
6861                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
6862                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6863                 if (cache->ro) {
6864                         space_info->bytes_readonly += len;
6865                         readonly = true;
6866                 }
6867                 spin_unlock(&cache->lock);
                     /*
                      * cache->lock is released, space_info->lock is still held
                      * while the global reserve is refilled.
                      */
6868                 if (!readonly && return_free_space &&
6869                     global_rsv->space_info == space_info) {
6870                         u64 to_add = len;
6871
6872                         spin_lock(&global_rsv->lock);
6873                         if (!global_rsv->full) {
                                     /* Refill only up to the reserve's size. */
6874                                 to_add = min(len, global_rsv->size -
6875                                              global_rsv->reserved);
6876                                 global_rsv->reserved += to_add;
6877                                 update_bytes_may_use(space_info, to_add);
6878                                 if (global_rsv->reserved >= global_rsv->size)
6879                                         global_rsv->full = 1;
6880                                 trace_btrfs_space_reservation(fs_info,
6881                                                               "space_info",
6882                                                               space_info->flags,
6883                                                               to_add, 1);
6884                                 len -= to_add;
6885                         }
6886                         spin_unlock(&global_rsv->lock);
6887                         /* Add to any tickets we may have */
6888                         if (len)
6889                                 space_info_add_new_bytes(fs_info, space_info,
6890                                                          len);
6891                 }
6892                 spin_unlock(&space_info->lock);
6893         }
6894
6895         if (cache)
6896                 btrfs_put_block_group(cache);
6897         return 0;
6898 }
6899
6900 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6901 {
6902         struct btrfs_fs_info *fs_info = trans->fs_info;
6903         struct btrfs_block_group_cache *block_group, *tmp;
6904         struct list_head *deleted_bgs;
6905         struct extent_io_tree *unpin;
6906         u64 start;
6907         u64 end;
6908         int ret;
6909
6910         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6911                 unpin = &fs_info->freed_extents[1];
6912         else
6913                 unpin = &fs_info->freed_extents[0];
6914
6915         while (!trans->aborted) {
6916                 struct extent_state *cached_state = NULL;
6917
6918                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6919                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6920                                             EXTENT_DIRTY, &cached_state);
6921                 if (ret) {
6922                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6923                         break;
6924                 }
6925
6926                 if (btrfs_test_opt(fs_info, DISCARD))
6927                         ret = btrfs_discard_extent(fs_info, start,
6928                                                    end + 1 - start, NULL);
6929
6930                 clear_extent_dirty(unpin, start, end, &cached_state);
6931                 unpin_extent_range(fs_info, start, end, true);
6932                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6933                 free_extent_state(cached_state);
6934                 cond_resched();
6935         }
6936
6937         /*
6938          * Transaction is finished.  We don't need the lock anymore.  We
6939          * do need to clean up the block groups in case of a transaction
6940          * abort.
6941          */
6942         deleted_bgs = &trans->transaction->deleted_bgs;
6943         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6944                 u64 trimmed = 0;
6945
6946                 ret = -EROFS;
6947                 if (!trans->aborted)
6948                         ret = btrfs_discard_extent(fs_info,
6949                                                    block_group->key.objectid,
6950                                                    block_group->key.offset,
6951                                                    &trimmed);
6952
6953                 list_del_init(&block_group->bg_list);
6954                 btrfs_put_block_group_trimming(block_group);
6955                 btrfs_put_block_group(block_group);
6956
6957                 if (ret) {
6958                         const char *errstr = btrfs_decode_error(ret);
6959                         btrfs_warn(fs_info,
6960                            "discard failed while removing blockgroup: errno=%d %s",
6961                                    ret, errstr);
6962                 }
6963         }
6964
6965         return 0;
6966 }
6967
/*
 * Drop @refs_to_drop references on the extent described by @node
 * (node->bytenr, node->num_bytes) and update the extent tree accordingly.
 *
 * @trans:          transaction handle
 * @node:           delayed ref node; only bytenr and num_bytes are read here
 * @parent:         passed through to lookup_extent_backref()
 * @root_objectid:  passed through to lookup_extent_backref()
 * @owner_objectid: values >= BTRFS_FIRST_FREE_OBJECTID mark a data extent;
 *                  for tree blocks it is compared against the block level
 * @owner_offset:   passed through to lookup_extent_backref()
 * @refs_to_drop:   number of references to remove; must be 1 for tree blocks
 * @extent_op:      optional extent op, applied only if references remain
 *
 * If references remain afterwards, the extent item's count is lowered and the
 * backref is removed.  If the count drops to zero, the extent item (together
 * with a keyed backref item in the following slot, if there is one) is
 * deleted, checksums are removed for data extents, the range is added to the
 * free space tree and the block group accounting is updated.
 *
 * Returns -ENOMEM if no path can be allocated.  Every later failure aborts
 * the transaction and returns the error that caused it.  Returns 0 on
 * success.
 *
 * NOTE(review): when the second lookup of the extent item ends with a
 * positive btrfs_search_slot() result (item not found), the leaf is printed
 * and an error is logged, but execution continues with whatever slot the
 * search landed on - confirm that this is intended.
 */
6968 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6969                                struct btrfs_delayed_ref_node *node, u64 parent,
6970                                u64 root_objectid, u64 owner_objectid,
6971                                u64 owner_offset, int refs_to_drop,
6972                                struct btrfs_delayed_extent_op *extent_op)
6973 {
6974         struct btrfs_fs_info *info = trans->fs_info;
6975         struct btrfs_key key;
6976         struct btrfs_path *path;
6977         struct btrfs_root *extent_root = info->extent_root;
6978         struct extent_buffer *leaf;
6979         struct btrfs_extent_item *ei;
6980         struct btrfs_extent_inline_ref *iref;
6981         int ret;
6982         int is_data;
6983         int extent_slot = 0;
6984         int found_extent = 0;
6985         int num_to_del = 1;
6986         u32 item_size;
6987         u64 refs;
6988         u64 bytenr = node->bytenr;
6989         u64 num_bytes = node->num_bytes;
6990         int last_ref = 0;
6991         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6992
6993         path = btrfs_alloc_path();
6994         if (!path)
6995                 return -ENOMEM;
6996
6997         path->reada = READA_FORWARD;
6998         path->leave_spinning = 1;
6999
7000         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
7001         BUG_ON(!is_data && refs_to_drop != 1);
7002
             /* Skinny metadata keys are only considered for tree blocks. */
7003         if (is_data)
7004                 skinny_metadata = false;
7005
7006         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
7007                                     parent, root_objectid, owner_objectid,
7008                                     owner_offset);
7009         if (ret == 0) {
                     /*
                      * Walk backwards from the backref, for a few slots at
                      * most, looking for the extent item in the same leaf.
                      */
7010                 extent_slot = path->slots[0];
7011                 while (extent_slot >= 0) {
7012                         btrfs_item_key_to_cpu(path->nodes[0], &key,
7013                                               extent_slot);
7014                         if (key.objectid != bytenr)
7015                                 break;
7016                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
7017                             key.offset == num_bytes) {
7018                                 found_extent = 1;
7019                                 break;
7020                         }
7021                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
7022                             key.offset == owner_objectid) {
7023                                 found_extent = 1;
7024                                 break;
7025                         }
7026                         if (path->slots[0] - extent_slot > 5)
7027                                 break;
7028                         extent_slot--;
7029                 }
7030
                     /*
                      * The extent item is not nearby, so the backref must be
                      * a separate keyed item (never an inline ref).  Remove
                      * it first, then search for the extent item itself.
                      */
7031                 if (!found_extent) {
7032                         BUG_ON(iref);
7033                         ret = remove_extent_backref(trans, path, NULL,
7034                                                     refs_to_drop,
7035                                                     is_data, &last_ref);
7036                         if (ret) {
7037                                 btrfs_abort_transaction(trans, ret);
7038                                 goto out;
7039                         }
7040                         btrfs_release_path(path);
7041                         path->leave_spinning = 1;
7042
7043                         key.objectid = bytenr;
7044                         key.type = BTRFS_EXTENT_ITEM_KEY;
7045                         key.offset = num_bytes;
7046
7047                         if (!is_data && skinny_metadata) {
7048                                 key.type = BTRFS_METADATA_ITEM_KEY;
7049                                 key.offset = owner_objectid;
7050                         }
7051
7052                         ret = btrfs_search_slot(trans, extent_root,
7053                                                 &key, path, -1, 1);
7054                         if (ret > 0 && skinny_metadata && path->slots[0]) {
7055                                 /*
7056                                  * Couldn't find our skinny metadata item,
7057                                  * see if we have ye olde extent item.
7058                                  */
7059                                 path->slots[0]--;
7060                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
7061                                                       path->slots[0]);
7062                                 if (key.objectid == bytenr &&
7063                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
7064                                     key.offset == num_bytes)
7065                                         ret = 0;
7066                         }
7067
                             /*
                              * Still not found: search again with the regular
                              * EXTENT_ITEM key.
                              */
7068                         if (ret > 0 && skinny_metadata) {
7069                                 skinny_metadata = false;
7070                                 key.objectid = bytenr;
7071                                 key.type = BTRFS_EXTENT_ITEM_KEY;
7072                                 key.offset = num_bytes;
7073                                 btrfs_release_path(path);
7074                                 ret = btrfs_search_slot(trans, extent_root,
7075                                                         &key, path, -1, 1);
7076                         }
7077
7078                         if (ret) {
7079                                 btrfs_err(info,
7080                                           "umm, got %d back from search, was looking for %llu",
7081                                           ret, bytenr);
7082                                 if (ret > 0)
7083                                         btrfs_print_leaf(path->nodes[0]);
7084                         }
7085                         if (ret < 0) {
7086                                 btrfs_abort_transaction(trans, ret);
7087                                 goto out;
7088                         }
7089                         extent_slot = path->slots[0];
7090                 }
7091         } else if (WARN_ON(ret == -ENOENT)) {
7092                 btrfs_print_leaf(path->nodes[0]);
7093                 btrfs_err(info,
7094                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
7095                         bytenr, parent, root_objectid, owner_objectid,
7096                         owner_offset);
7097                 btrfs_abort_transaction(trans, ret);
7098                 goto out;
7099         } else {
7100                 btrfs_abort_transaction(trans, ret);
7101                 goto out;
7102         }
7103
7104         leaf = path->nodes[0];
7105         item_size = btrfs_item_size_nr(leaf, extent_slot);
             /* An item smaller than struct btrfs_extent_item is rejected. */
7106         if (unlikely(item_size < sizeof(*ei))) {
7107                 ret = -EINVAL;
7108                 btrfs_print_v0_err(info);
7109                 btrfs_abort_transaction(trans, ret);
7110                 goto out;
7111         }
7112         ei = btrfs_item_ptr(leaf, extent_slot,
7113                             struct btrfs_extent_item);
             /*
              * A tree block stored under a regular EXTENT_ITEM key carries a
              * btrfs_tree_block_info right after the extent item; its level
              * is expected to match owner_objectid.
              */
7114         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7115             key.type == BTRFS_EXTENT_ITEM_KEY) {
7116                 struct btrfs_tree_block_info *bi;
7117                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7118                 bi = (struct btrfs_tree_block_info *)(ei + 1);
7119                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7120         }
7121
7122         refs = btrfs_extent_refs(leaf, ei);
7123         if (refs < refs_to_drop) {
7124                 btrfs_err(info,
7125                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7126                           refs_to_drop, refs, bytenr);
7127                 ret = -EINVAL;
7128                 btrfs_abort_transaction(trans, ret);
7129                 goto out;
7130         }
7131         refs -= refs_to_drop;
7132
7133         if (refs > 0) {
7134                 if (extent_op)
7135                         __run_delayed_extent_op(extent_op, leaf, ei);
7136                 /*
7137                  * In the case of inline back ref, reference count will
7138                  * be updated by remove_extent_backref
7139                  */
7140                 if (iref) {
7141                         BUG_ON(!found_extent);
7142                 } else {
7143                         btrfs_set_extent_refs(leaf, ei, refs);
7144                         btrfs_mark_buffer_dirty(leaf);
7145                 }
7146                 if (found_extent) {
7147                         ret = remove_extent_backref(trans, path, iref,
7148                                                     refs_to_drop, is_data,
7149                                                     &last_ref);
7150                         if (ret) {
7151                                 btrfs_abort_transaction(trans, ret);
7152                                 goto out;
7153                         }
7154                 }
7155         } else {
                     /*
                      * Last reference.  With an inline ref only the extent
                      * item is deleted; with a keyed backref the path points
                      * one slot past the extent item and both are deleted.
                      */
7156                 if (found_extent) {
7157                         BUG_ON(is_data && refs_to_drop !=
7158                                extent_data_ref_count(path, iref));
7159                         if (iref) {
7160                                 BUG_ON(path->slots[0] != extent_slot);
7161                         } else {
7162                                 BUG_ON(path->slots[0] != extent_slot + 1);
7163                                 path->slots[0] = extent_slot;
7164                                 num_to_del = 2;
7165                         }
7166                 }
7167
7168                 last_ref = 1;
7169                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7170                                       num_to_del);
7171                 if (ret) {
7172                         btrfs_abort_transaction(trans, ret);
7173                         goto out;
7174                 }
7175                 btrfs_release_path(path);
7176
7177                 if (is_data) {
7178                         ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7179                         if (ret) {
7180                                 btrfs_abort_transaction(trans, ret);
7181                                 goto out;
7182                         }
7183                 }
7184
7185                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
7186                 if (ret) {
7187                         btrfs_abort_transaction(trans, ret);
7188                         goto out;
7189                 }
7190
7191                 ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7192                 if (ret) {
7193                         btrfs_abort_transaction(trans, ret);
7194                         goto out;
7195                 }
7196         }
7197         btrfs_release_path(path);
7198
7199 out:
7200         btrfs_free_path(path);
7201         return ret;
7202 }
7203
7204 /*
7205  * when we free an block, it is possible (and likely) that we free the last
7206  * delayed ref for that extent as well.  This searches the delayed ref tree for
7207  * a given extent, and if there are no other delayed refs to be processed, it
7208  * removes it from the tree.
7209  */
7210 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7211                                       u64 bytenr)
7212 {
7213         struct btrfs_delayed_ref_head *head;
7214         struct btrfs_delayed_ref_root *delayed_refs;
7215         int ret = 0;
7216
7217         delayed_refs = &trans->transaction->delayed_refs;
7218         spin_lock(&delayed_refs->lock);
7219         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7220         if (!head)
7221                 goto out_delayed_unlock;
7222
7223         spin_lock(&head->lock);
7224         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
7225                 goto out;
7226
7227         if (cleanup_extent_op(head) != NULL)
7228                 goto out;
7229
7230         /*
7231          * waiting for the lock here would deadlock.  If someone else has it
7232          * locked they are already in the process of dropping it anyway
7233          */
7234         if (!mutex_trylock(&head->mutex))
7235                 goto out;
7236
7237         btrfs_delete_ref_head(delayed_refs, head);
7238         head->processing = 0;
7239
7240         spin_unlock(&head->lock);
7241         spin_unlock(&delayed_refs->lock);
7242
7243         BUG_ON(head->extent_op);
7244         if (head->must_insert_reserved)
7245                 ret = 1;
7246
7247         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
7248         mutex_unlock(&head->mutex);
7249         btrfs_put_delayed_ref_head(head);
7250         return ret;
7251 out:
7252         spin_unlock(&head->lock);
7253
7254 out_delayed_unlock:
7255         spin_unlock(&delayed_refs->lock);
7256         return 0;
7257 }
7258
7259 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7260                            struct btrfs_root *root,
7261                            struct extent_buffer *buf,
7262                            u64 parent, int last_ref)
7263 {
7264         struct btrfs_fs_info *fs_info = root->fs_info;
7265         int pin = 1;
7266         int ret;
7267
7268         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7269                 int old_ref_mod, new_ref_mod;
7270
7271                 btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7272                                    root->root_key.objectid,
7273                                    btrfs_header_level(buf), 0,
7274                                    BTRFS_DROP_DELAYED_REF);
7275                 ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7276                                                  buf->len, parent,
7277                                                  root->root_key.objectid,
7278                                                  btrfs_header_level(buf),
7279                                                  BTRFS_DROP_DELAYED_REF, NULL,
7280                                                  &old_ref_mod, &new_ref_mod);
7281                 BUG_ON(ret); /* -ENOMEM */
7282                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
7283         }
7284
7285         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7286                 struct btrfs_block_group_cache *cache;
7287
7288                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7289                         ret = check_ref_cleanup(trans, buf->start);
7290                         if (!ret)
7291                                 goto out;
7292                 }
7293
7294                 pin = 0;
7295                 cache = btrfs_lookup_block_group(fs_info, buf->start);
7296
7297                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7298                         pin_down_extent(fs_info, cache, buf->start,
7299                                         buf->len, 1);
7300                         btrfs_put_block_group(cache);
7301                         goto out;
7302                 }
7303
7304                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7305
7306                 btrfs_add_free_space(cache, buf->start, buf->len);
7307                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7308                 btrfs_put_block_group(cache);
7309                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7310         }
7311 out:
7312         if (pin)
7313                 add_pinned_bytes(fs_info, buf->len, true,
7314                                  root->root_key.objectid);
7315
7316         if (last_ref) {
7317                 /*
7318                  * Deleting the buffer, clear the corrupt flag since it doesn't
7319                  * matter anymore.
7320                  */
7321                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7322         }
7323 }
7324
7325 /* Can return -ENOMEM */
7326 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7327                       struct btrfs_root *root,
7328                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7329                       u64 owner, u64 offset)
7330 {
7331         struct btrfs_fs_info *fs_info = root->fs_info;
7332         int old_ref_mod, new_ref_mod;
7333         int ret;
7334
7335         if (btrfs_is_testing(fs_info))
7336                 return 0;
7337
7338         if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7339                 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7340                                    root_objectid, owner, offset,
7341                                    BTRFS_DROP_DELAYED_REF);
7342
7343         /*
7344          * tree log blocks never actually go into the extent allocation
7345          * tree, just update pinning info and exit early.
7346          */
7347         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7348                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7349                 /* unlocks the pinned mutex */
7350                 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7351                 old_ref_mod = new_ref_mod = 0;
7352                 ret = 0;
7353         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7354                 ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7355                                                  num_bytes, parent,
7356                                                  root_objectid, (int)owner,
7357                                                  BTRFS_DROP_DELAYED_REF, NULL,
7358                                                  &old_ref_mod, &new_ref_mod);
7359         } else {
7360                 ret = btrfs_add_delayed_data_ref(trans, bytenr,
7361                                                  num_bytes, parent,
7362                                                  root_objectid, owner, offset,
7363                                                  0, BTRFS_DROP_DELAYED_REF,
7364                                                  &old_ref_mod, &new_ref_mod);
7365         }
7366
7367         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7368                 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
7369
7370                 add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7371         }
7372
7373         return ret;
7374 }
7375
7376 /*
7377  * when we wait for progress in the block group caching, its because
7378  * our allocation attempt failed at least once.  So, we must sleep
7379  * and let some progress happen before we try again.
7380  *
7381  * This function will sleep at least once waiting for new free space to
7382  * show up, and then it will check the block group free space numbers
7383  * for our min num_bytes.  Another option is to have it go ahead
7384  * and look in the rbtree for a free extent of a given size, but this
7385  * is a good start.
7386  *
7387  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7388  * any of the information in this block group.
7389  */
7390 static noinline void
7391 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7392                                 u64 num_bytes)
7393 {
7394         struct btrfs_caching_control *caching_ctl;
7395
7396         caching_ctl = get_caching_control(cache);
7397         if (!caching_ctl)
7398                 return;
7399
7400         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7401                    (cache->free_space_ctl->free_space >= num_bytes));
7402
7403         put_caching_control(caching_ctl);
7404 }
7405
7406 static noinline int
7407 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7408 {
7409         struct btrfs_caching_control *caching_ctl;
7410         int ret = 0;
7411
7412         caching_ctl = get_caching_control(cache);
7413         if (!caching_ctl)
7414                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7415
7416         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7417         if (cache->cached == BTRFS_CACHE_ERROR)
7418                 ret = -EIO;
7419         put_caching_control(caching_ctl);
7420         return ret;
7421 }
7422
/*
 * Named values of a loop counter, numbered 0 to 3 in declaration order.
 *
 * NOTE(review): presumably these are the successive passes of the free
 * extent search, stored in the "loop" member of struct find_free_extent_ctl,
 * whose comment refers to find_free_extent_update_loop() for details -
 * confirm against that function.
 */
7423 enum btrfs_loop_type {
7424         LOOP_CACHING_NOWAIT = 0,
7425         LOOP_CACHING_WAIT = 1,
7426         LOOP_ALLOC_CHUNK = 2,
7427         LOOP_NO_EMPTY_SIZE = 3,
7428 };
7429
7430 static inline void
7431 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7432                        int delalloc)
7433 {
7434         if (delalloc)
7435                 down_read(&cache->data_rwsem);
7436 }
7437
/*
 * Take a reference on @cache and, when @delalloc is non-zero, also take
 * cache->data_rwsem for reading.
 */
static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
                       int delalloc)
{
        btrfs_get_block_group(cache);
        btrfs_lock_block_group(cache, delalloc);
}
7446
/*
 * Lock @cluster and return the block group it currently points at.
 *
 * cluster->refill_lock is taken here and is still held on every return path,
 * including the NULL one.
 *
 * Returns:
 *  - NULL if the cluster has no block group;
 *  - @block_group itself if the cluster points at it, without taking an
 *    extra reference or the rwsem here;
 *  - otherwise the cluster's block group with a reference taken and, when
 *    @delalloc is non-zero, its data_rwsem held for reading.
 */
7447 static struct btrfs_block_group_cache *
7448 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7449                    struct btrfs_free_cluster *cluster,
7450                    int delalloc)
7451 {
7452         struct btrfs_block_group_cache *used_bg = NULL;
7453
7454         spin_lock(&cluster->refill_lock);
7455         while (1) {
7456                 used_bg = cluster->block_group;
7457                 if (!used_bg)
7458                         return NULL;
7459
7460                 if (used_bg == block_group)
7461                         return used_bg;
7462
7463                 btrfs_get_block_group(used_bg);
7464
7465                 if (!delalloc)
7466                         return used_bg;
7467
                     /* Fast path: get the rwsem without dropping the spinlock. */
7468                 if (down_read_trylock(&used_bg->data_rwsem))
7469                         return used_bg;
7470
                     /*
                      * Slow path: release the spinlock before blocking on the
                      * rwsem, then retake it.
                      */
7471                 spin_unlock(&cluster->refill_lock);
7472
7473                 /* We should only have one-level nested. */
7474                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7475
7476                 spin_lock(&cluster->refill_lock);
                     /*
                      * The cluster may have been pointed at another block
                      * group while the spinlock was released; if so, undo and
                      * retry with the new one.
                      */
7477                 if (used_bg == cluster->block_group)
7478                         return used_bg;
7479
7480                 up_read(&used_bg->data_rwsem);
7481                 btrfs_put_block_group(used_bg);
7482         }
7483 }
7484
7485 static inline void
7486 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7487                          int delalloc)
7488 {
7489         if (delalloc)
7490                 up_read(&cache->data_rwsem);
7491         btrfs_put_block_group(cache);
7492 }
7493
7494 /*
7495  * Structure used internally for find_free_extent() function.  Wraps needed
7496  * parameters.
7497  */
7498 struct find_free_extent_ctl {
7499         /* Basic allocation info */
7500         u64 ram_bytes;
7501         u64 num_bytes;
7502         u64 empty_size;
7503         u64 flags;
7504         int delalloc;
7505
7506         /* Where to start the search inside the bg */
7507         u64 search_start;
7508
7509         /* For clustered allocation */
7510         u64 empty_cluster;
7511
7512         bool have_caching_bg;
7513         bool orig_have_caching_bg;
7514
7515         /* RAID index, converted from flags */
7516         int index;
7517
7518         /*
7519          * Current loop number, check find_free_extent_update_loop() for details
7520          */
7521         int loop;
7522
7523         /*
7524          * Whether we're refilling a cluster, if true we need to re-search
7525          * current block group but don't try to refill the cluster again.
7526          */
7527         bool retry_clustered;
7528
7529         /*
7530          * Whether we're updating free space cache, if true we need to re-search
7531          * current block group but don't try updating free space cache again.
7532          */
7533         bool retry_unclustered;
7534
7535         /* If current block group is cached */
7536         int cached;
7537
7538         /* Max contiguous hole found */
7539         u64 max_extent_size;
7540
7541         /* Total free space from free space cache, not always contiguous */
7542         u64 total_free_space;
7543
7544         /* Found result */
7545         u64 found_offset;
7546 };
7547
7548
7549 /*
7550  * Helper function for find_free_extent().
7551  *
7552  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
7553  * Return -EAGAIN to inform caller that we need to re-search this block group
7554  * Return >0 to inform caller that we find nothing
7555  * Return 0 means we have found a location and set ffe_ctl->found_offset.
7556  */
/*
 * @bg:             block group currently being searched
 * @last_ptr:       free cluster to allocate from; its refill_lock is taken by
 *                  btrfs_lock_cluster() and is dropped again before every
 *                  return from this function
 * @ffe_ctl:        search state; found_offset and retry_clustered may be
 *                  written, max_extent_size is passed by address to
 *                  btrfs_alloc_from_cluster()
 * @cluster_bg_ret: written only when an already existing cluster satisfied
 *                  the allocation; it then holds the block group that owns
 *                  that cluster (which may differ from @bg).  It is left
 *                  untouched on the refill path and on all failure returns.
 */
7557 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7558                 struct btrfs_free_cluster *last_ptr,
7559                 struct find_free_extent_ctl *ffe_ctl,
7560                 struct btrfs_block_group_cache **cluster_bg_ret)
7561 {
7562         struct btrfs_fs_info *fs_info = bg->fs_info;
7563         struct btrfs_block_group_cache *cluster_bg;
7564         u64 aligned_cluster;
7565         u64 offset;
7566         int ret;
7567
7568         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
        /* NULL means the cluster has no block group yet: set one up. */
7569         if (!cluster_bg)
7570                 goto refill_cluster;
        /*
         * The cluster belongs to a different block group that is read-only
         * or does not match the requested flags: do not allocate from it.
         */
7571         if (cluster_bg != bg && (cluster_bg->ro ||
7572             !block_group_bits(cluster_bg, ffe_ctl->flags)))
7573                 goto release_cluster;
7574
7575         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7576                         ffe_ctl->num_bytes, cluster_bg->key.objectid,
7577                         &ffe_ctl->max_extent_size);
7578         if (offset) {
7579                 /* We have a block, we're done */
7580                 spin_unlock(&last_ptr->refill_lock);
7581                 trace_btrfs_reserve_extent_cluster(cluster_bg,
7582                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
7583                 *cluster_bg_ret = cluster_bg;
7584                 ffe_ctl->found_offset = offset;
7585                 return 0;
7586         }
7587         WARN_ON(last_ptr->block_group != cluster_bg);
7588
7589 release_cluster:
7590         /*
7591          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
7592          * lets just skip it and let the allocator find whatever block it can
7593          * find. If we reach this point, we will have tried the cluster
7594          * allocator plenty of times and not have found anything, so we are
7595          * likely way too fragmented for the clustering stuff to find anything.
7596          *
7597          * However, if the cluster is taken from the current block group,
7598          * release the cluster first, so that we stand a better chance of
7599          * succeeding in the unclustered allocation.
7600          */
7601         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7602                 spin_unlock(&last_ptr->refill_lock);
7603                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7604                 return -ENOENT;
7605         }
7606
7607         /* This cluster didn't work out, free it and start over */
7608         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7609
        /*
         * Drop the reference (and, for delalloc, the data_rwsem) that
         * btrfs_lock_cluster() took on the foreign block group.
         */
7610         if (cluster_bg != bg)
7611                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7612
7613 refill_cluster:
        /* No new cluster is set up at LOOP_NO_EMPTY_SIZE; go unclustered. */
7614         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7615                 spin_unlock(&last_ptr->refill_lock);
7616                 return -ENOENT;
7617         }
7618
7619         aligned_cluster = max_t(u64,
7620                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7621                         bg->full_stripe_len);
7622         ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
7623                         ffe_ctl->search_start, ffe_ctl->num_bytes,
7624                         aligned_cluster);
7625         if (ret == 0) {
7626                 /* Now pull our allocation out of this cluster */
7627                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
7628                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
7629                                 &ffe_ctl->max_extent_size);
7630                 if (offset) {
7631                         /* We found one, proceed */
7632                         spin_unlock(&last_ptr->refill_lock);
7633                         trace_btrfs_reserve_extent_cluster(bg,
7634                                         ffe_ctl->search_start,
7635                                         ffe_ctl->num_bytes);
7636                         ffe_ctl->found_offset = offset;
7637                         return 0;
7638                 }
7639         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7640                    !ffe_ctl->retry_clustered) {
                /*
                 * Setting up the cluster failed while this block group is not
                 * fully cached and we are past LOOP_CACHING_NOWAIT: wait for
                 * caching progress and ask the caller to re-search this block
                 * group.  retry_clustered keeps this to one wait per group.
                 */
7641                 spin_unlock(&last_ptr->refill_lock);
7642
7643                 ffe_ctl->retry_clustered = true;
7644                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7645                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7646                 return -EAGAIN;
7647         }
7648         /*
7649          * At this point we either didn't find a cluster or we weren't able to
7650          * allocate a block from our cluster.  Free the cluster we've been
7651          * trying to use, and go to the next block group.
7652          */
7653         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7654         spin_unlock(&last_ptr->refill_lock);
7655         return 1;
7656 }
7657
7658 /*
7659  * Return >0 to inform caller that we find nothing
7660  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
7661  * Return -EAGAIN to inform caller that we need to re-search this block group
7662  */
7663 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7664                 struct btrfs_free_cluster *last_ptr,
7665                 struct find_free_extent_ctl *ffe_ctl)
7666 {
7667         u64 offset;
7668
7669         /*
7670          * We are doing an unclustered allocation, set the fragmented flag so
7671          * we don't bother trying to setup a cluster again until we get more
7672          * space.
7673          */
7674         if (unlikely(last_ptr)) {
7675                 spin_lock(&last_ptr->lock);
7676                 last_ptr->fragmented = 1;
7677                 spin_unlock(&last_ptr->lock);
7678         }
7679         if (ffe_ctl->cached) {
7680                 struct btrfs_free_space_ctl *free_space_ctl;
7681
7682                 free_space_ctl = bg->free_space_ctl;
7683                 spin_lock(&free_space_ctl->tree_lock);
7684                 if (free_space_ctl->free_space <
7685                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7686                     ffe_ctl->empty_size) {
7687                         ffe_ctl->total_free_space = max_t(u64,
7688                                         ffe_ctl->total_free_space,
7689                                         free_space_ctl->free_space);
7690                         spin_unlock(&free_space_ctl->tree_lock);
7691                         return 1;
7692                 }
7693                 spin_unlock(&free_space_ctl->tree_lock);
7694         }
7695
7696         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7697                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
7698                         &ffe_ctl->max_extent_size);
7699
7700         /*
7701          * If we didn't find a chunk, and we haven't failed on this block group
7702          * before, and this block group is in the middle of caching and we are
7703          * ok with waiting, then go ahead and wait for progress to be made, and
7704          * set @retry_unclustered to true.
7705          *
7706          * If @retry_unclustered is true then we've already waited on this
7707          * block group once and should move on to the next block group.
7708          */
7709         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7710             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7711                 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7712                                                 ffe_ctl->empty_size);
7713                 ffe_ctl->retry_unclustered = true;
7714                 return -EAGAIN;
7715         } else if (!offset) {
7716                 return 1;
7717         }
7718         ffe_ctl->found_offset = offset;
7719         return 0;
7720 }
7721
7722 /*
7723  * Return >0 means caller needs to re-search for free extent
7724  * Return 0 means we have the needed free extent.
7725  * Return <0 means we failed to locate any free extent.
7726  */
7727 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7728                                         struct btrfs_free_cluster *last_ptr,
7729                                         struct btrfs_key *ins,
7730                                         struct find_free_extent_ctl *ffe_ctl,
7731                                         int full_search, bool use_cluster)
7732 {
7733         struct btrfs_root *root = fs_info->extent_root;
7734         int ret;
7735
7736         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7737             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7738                 ffe_ctl->orig_have_caching_bg = true;
7739
7740         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7741             ffe_ctl->have_caching_bg)
7742                 return 1;
7743
7744         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7745                 return 1;
7746
7747         if (ins->objectid) {
7748                 if (!use_cluster && last_ptr) {
7749                         spin_lock(&last_ptr->lock);
7750                         last_ptr->window_start = ins->objectid;
7751                         spin_unlock(&last_ptr->lock);
7752                 }
7753                 return 0;
7754         }
7755
7756         /*
7757          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7758          *                      caching kthreads as we move along
7759          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7760          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7761          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7762          *                     again
7763          */
7764         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7765                 ffe_ctl->index = 0;
7766                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7767                         /*
7768                          * We want to skip the LOOP_CACHING_WAIT step if we
7769                          * don't have any uncached bgs and we've already done a
7770                          * full search through.
7771                          */
7772                         if (ffe_ctl->orig_have_caching_bg || !full_search)
7773                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
7774                         else
7775                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7776                 } else {
7777                         ffe_ctl->loop++;
7778                 }
7779
7780                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7781                         struct btrfs_trans_handle *trans;
7782                         int exist = 0;
7783
7784                         trans = current->journal_info;
7785                         if (trans)
7786                                 exist = 1;
7787                         else
7788                                 trans = btrfs_join_transaction(root);
7789
7790                         if (IS_ERR(trans)) {
7791                                 ret = PTR_ERR(trans);
7792                                 return ret;
7793                         }
7794
7795                         ret = do_chunk_alloc(trans, ffe_ctl->flags,
7796                                              CHUNK_ALLOC_FORCE);
7797
7798                         /*
7799                          * If we can't allocate a new chunk we've already looped
7800                          * through at least once, move on to the NO_EMPTY_SIZE
7801                          * case.
7802                          */
7803                         if (ret == -ENOSPC)
7804                                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7805
7806                         /* Do not bail out on ENOSPC since we can do more. */
7807                         if (ret < 0 && ret != -ENOSPC)
7808                                 btrfs_abort_transaction(trans, ret);
7809                         else
7810                                 ret = 0;
7811                         if (!exist)
7812                                 btrfs_end_transaction(trans);
7813                         if (ret)
7814                                 return ret;
7815                 }
7816
7817                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7818                         /*
7819                          * Don't loop again if we already have no empty_size and
7820                          * no empty_cluster.
7821                          */
7822                         if (ffe_ctl->empty_size == 0 &&
7823                             ffe_ctl->empty_cluster == 0)
7824                                 return -ENOSPC;
7825                         ffe_ctl->empty_size = 0;
7826                         ffe_ctl->empty_cluster = 0;
7827                 }
7828                 return 1;
7829         }
7830         return -ENOSPC;
7831 }
7832
7833 /*
7834  * walks the btree of allocated extents and find a hole of a given size.
7835  * The key ins is changed to record the hole:
7836  * ins->objectid == start position
7837  * ins->type == BTRFS_EXTENT_ITEM_KEY
7838  * ins->offset == the size of the hole.
7839  * Any available blocks before search_start are skipped.
7840  *
7841  * If there is no suitable free space, we will record the max size of
7842  * the free space extent currently.
7843  *
7844  * The overall logic and call chain:
7845  *
7846  * find_free_extent()
7847  * |- Iterate through all block groups
7848  * |  |- Get a valid block group
7849  * |  |- Try to do clustered allocation in that block group
7850  * |  |- Try to do unclustered allocation in that block group
7851  * |  |- Check if the result is valid
7852  * |  |  |- If valid, then exit
7853  * |  |- Jump to next block group
7854  * |
7855  * |- Push harder to find free extents
7856  *    |- If not found, re-iterate all block groups
7857  */
/*
 * @fs_info:    filesystem to allocate from
 * @ram_bytes:  passed on to btrfs_add_reserved_bytes() together with
 *              @num_bytes
 * @num_bytes:  size of the extent to find; a warning is issued if it is
 *              smaller than fs_info->sectorsize
 * @empty_size: extra room requested next to the extent; dropped to 0 by
 *              find_free_extent_update_loop() at LOOP_NO_EMPTY_SIZE
 * @hint_byte:  preferred start; replaced by the cluster's window_start when
 *              the cluster has a block group or is marked fragmented
 * @ins:        result key, reset on entry and filled in on success
 * @flags:      block group flags the allocation must match
 * @delalloc:   non-zero makes the block group helpers also take data_rwsem
 *
 * Return 0 on success.  Return -ENOSPC when nothing suitable was found; where
 * a maximum extent size is known it is stored in ins->offset, and after a
 * full search it is also saved in space_info->max_extent_size.  Other
 * negative values come from find_free_extent_update_loop().
 */
7858 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7859                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7860                                 u64 hint_byte, struct btrfs_key *ins,
7861                                 u64 flags, int delalloc)
7862 {
7863         int ret = 0;
7864         struct btrfs_free_cluster *last_ptr = NULL;
7865         struct btrfs_block_group_cache *block_group = NULL;
7866         struct find_free_extent_ctl ffe_ctl = {0};
7867         struct btrfs_space_info *space_info;
7868         bool use_cluster = true;
7869         bool full_search = false;
7870
7871         WARN_ON(num_bytes < fs_info->sectorsize);
7872
7873         ffe_ctl.ram_bytes = ram_bytes;
7874         ffe_ctl.num_bytes = num_bytes;
7875         ffe_ctl.empty_size = empty_size;
7876         ffe_ctl.flags = flags;
7877         ffe_ctl.search_start = 0;
7878         ffe_ctl.retry_clustered = false;
7879         ffe_ctl.retry_unclustered = false;
7880         ffe_ctl.delalloc = delalloc;
7881         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7882         ffe_ctl.have_caching_bg = false;
7883         ffe_ctl.orig_have_caching_bg = false;
7884         ffe_ctl.found_offset = 0;
7885
        /* objectid == 0 doubles as the "nothing found yet" marker below. */
7886         ins->type = BTRFS_EXTENT_ITEM_KEY;
7887         ins->objectid = 0;
7888         ins->offset = 0;
7889
7890         trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7891
7892         space_info = __find_space_info(fs_info, flags);
7893         if (!space_info) {
7894                 btrfs_err(fs_info, "No space info for %llu", flags);
7895                 return -ENOSPC;
7896         }
7897
7898         /*
7899          * If our free space is heavily fragmented we may not be able to make
7900          * big contiguous allocations, so instead of doing the expensive search
7901          * for free space, simply return ENOSPC with our max_extent_size so we
7902          * can go ahead and search for a more manageable chunk.
7903          *
7904          * If our max_extent_size is large enough for our allocation simply
7905          * disable clustering since we will likely not be able to find enough
7906          * space to create a cluster and induce latency trying.
7907          */
7908         if (unlikely(space_info->max_extent_size)) {
                /* Re-check under the lock; the test above was unlocked. */
7909                 spin_lock(&space_info->lock);
7910                 if (space_info->max_extent_size &&
7911                     num_bytes > space_info->max_extent_size) {
7912                         ins->offset = space_info->max_extent_size;
7913                         spin_unlock(&space_info->lock);
7914                         return -ENOSPC;
7915                 } else if (space_info->max_extent_size) {
7916                         use_cluster = false;
7917                 }
7918                 spin_unlock(&space_info->lock);
7919         }
7920
7921         last_ptr = fetch_cluster_info(fs_info, space_info,
7922                                       &ffe_ctl.empty_cluster);
7923         if (last_ptr) {
7924                 spin_lock(&last_ptr->lock);
7925                 if (last_ptr->block_group)
7926                         hint_byte = last_ptr->window_start;
7927                 if (last_ptr->fragmented) {
7928                         /*
7929                          * We still set window_start so we can keep track of the
7930                          * last place we found an allocation to try and save
7931                          * some time.
7932                          */
7933                         hint_byte = last_ptr->window_start;
7934                         use_cluster = false;
7935                 }
7936                 spin_unlock(&last_ptr->lock);
7937         }
7938
7939         ffe_ctl.search_start = max(ffe_ctl.search_start,
7940                                    first_logical_byte(fs_info, 0));
7941         ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
        /* The hint is only tried when it became the search start. */
7942         if (ffe_ctl.search_start == hint_byte) {
7943                 block_group = btrfs_lookup_block_group(fs_info,
7944                                                        ffe_ctl.search_start);
7945                 /*
7946                  * we don't want to use the block group if it doesn't match our
7947                  * allocation bits, or if it's not cached.
7948                  *
7949                  * However if we are re-searching with an ideal block group
7950                  * picked out then we don't care that the block group is cached.
7951                  */
7952                 if (block_group && block_group_bits(block_group, flags) &&
7953                     block_group->cached != BTRFS_CACHE_NO) {
7954                         down_read(&space_info->groups_sem);
7955                         if (list_empty(&block_group->list) ||
7956                             block_group->ro) {
7957                                 /*
7958                                  * someone is removing this block group,
7959                                  * we can't jump into the have_block_group
7960                                  * target because our list pointers are not
7961                                  * valid
7962                                  */
7963                                 btrfs_put_block_group(block_group);
7964                                 up_read(&space_info->groups_sem);
7965                         } else {
                                /*
                                 * Jump into the middle of the loop below with
                                 * groups_sem held for read and the hinted
                                 * block group as the current list entry.
                                 */
7966                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7967                                                 block_group->flags);
7968                                 btrfs_lock_block_group(block_group, delalloc);
7969                                 goto have_block_group;
7970                         }
7971                 } else if (block_group) {
7972                         btrfs_put_block_group(block_group);
7973                 }
7974         }
/* Start, or restart, one pass over the block groups of the current index. */
7975 search:
7976         ffe_ctl.have_caching_bg = false;
7977         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7978             ffe_ctl.index == 0)
7979                 full_search = true;
7980         down_read(&space_info->groups_sem);
7981         list_for_each_entry(block_group,
7982                             &space_info->block_groups[ffe_ctl.index], list) {
7983                 /* If the block group is read-only, we can skip it entirely. */
7984                 if (unlikely(block_group->ro))
7985                         continue;
7986
7987                 btrfs_grab_block_group(block_group, delalloc);
7988                 ffe_ctl.search_start = block_group->key.objectid;
7989
7990                 /*
7991                  * this can happen if we end up cycling through all the
7992                  * raid types, but we want to make sure we only allocate
7993                  * for the proper type.
7994                  */
7995                 if (!block_group_bits(block_group, flags)) {
7996                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
7997                                 BTRFS_BLOCK_GROUP_RAID1 |
7998                                 BTRFS_BLOCK_GROUP_RAID5 |
7999                                 BTRFS_BLOCK_GROUP_RAID6 |
8000                                 BTRFS_BLOCK_GROUP_RAID10;
8001
8002                         /*
8003                          * if they asked for extra copies and this block group
8004                          * doesn't provide them, bail.  This does allow us to
8005                          * fill raid0 from raid1.
8006                          */
8007                         if ((flags & extra) && !(block_group->flags & extra))
8008                                 goto loop;
8009                 }
8010
/*
 * Also reached by goto: from the hint_byte path above, and whenever one of the
 * helpers below returns -EAGAIN to have this block group searched again.
 */
8011 have_block_group:
8012                 ffe_ctl.cached = block_group_cache_done(block_group);
8013                 if (unlikely(!ffe_ctl.cached)) {
8014                         ffe_ctl.have_caching_bg = true;
8015                         ret = cache_block_group(block_group, 0);
                        /*
                         * NOTE(review): a failure to start caching is fatal
                         * here (BUG_ON) instead of skipping this block group.
                         */
8016                         BUG_ON(ret < 0);
8017                         ret = 0;
8018                 }
8019
8020                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
8021                         goto loop;
8022
8023                 /*
8024                  * Ok we want to try and use the cluster allocator, so
8025                  * lets look there
8026                  */
8027                 if (last_ptr && use_cluster) {
8028                         struct btrfs_block_group_cache *cluster_bg = NULL;
8029
8030                         ret = find_free_extent_clustered(block_group, last_ptr,
8031                                                          &ffe_ctl, &cluster_bg);
8032
8033                         if (ret == 0) {
                                /*
                                 * The extent came from a cluster owned by a
                                 * different block group: continue with that
                                 * one and release the one we were searching.
                                 */
8034                                 if (cluster_bg && cluster_bg != block_group) {
8035                                         btrfs_release_block_group(block_group,
8036                                                                   delalloc);
8037                                         block_group = cluster_bg;
8038                                 }
8039                                 goto checks;
8040                         } else if (ret == -EAGAIN) {
8041                                 goto have_block_group;
8042                         } else if (ret > 0) {
8043                                 goto loop;
8044                         }
8045                         /* ret == -ENOENT case falls through */
8046                 }
8047
8048                 ret = find_free_extent_unclustered(block_group, last_ptr,
8049                                                    &ffe_ctl);
8050                 if (ret == -EAGAIN)
8051                         goto have_block_group;
8052                 else if (ret > 0)
8053                         goto loop;
8054                 /* ret == 0 case falls through */
/* ffe_ctl.found_offset is valid from here on; align it and reserve. */
8055 checks:
8056                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
8057                                              fs_info->stripesize);
8058
8059                 /* move on to the next group */
8060                 if (ffe_ctl.search_start + num_bytes >
8061                     block_group->key.objectid + block_group->key.offset) {
8062                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8063                                              num_bytes);
8064                         goto loop;
8065                 }
8066
                /* Give back the bytes skipped by the round_up() above. */
8067                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
8068                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8069                                 ffe_ctl.search_start - ffe_ctl.found_offset);
8070
8071                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
8072                                 num_bytes, delalloc);
8073                 if (ret == -EAGAIN) {
8074                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8075                                              num_bytes);
8076                         goto loop;
8077                 }
8078                 btrfs_inc_block_group_reservations(block_group);
8079
8080                 /* we are all good, lets return */
8081                 ins->objectid = ffe_ctl.search_start;
8082                 ins->offset = num_bytes;
8083
8084                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
8085                                            num_bytes);
8086                 btrfs_release_block_group(block_group, delalloc);
8087                 break;
/* This block group did not work out: reset per-group state, release it. */
8088 loop:
8089                 ffe_ctl.retry_clustered = false;
8090                 ffe_ctl.retry_unclustered = false;
8091                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
8092                        ffe_ctl.index);
8093                 btrfs_release_block_group(block_group, delalloc);
8094                 cond_resched();
8095         }
8096         up_read(&space_info->groups_sem);
8097
        /*
         * ins->objectid is still 0 here unless the loop above reserved an
         * extent.  A positive return asks for another pass.
         */
8098         ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
8099                                            full_search, use_cluster);
8100         if (ret > 0)
8101                 goto search;
8102
8103         if (ret == -ENOSPC) {
8104                 /*
8105                  * Use ffe_ctl->total_free_space as fallback if we can't find
8106                  * any contiguous hole.
8107                  */
8108                 if (!ffe_ctl.max_extent_size)
8109                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
8110                 spin_lock(&space_info->lock);
8111                 space_info->max_extent_size = ffe_ctl.max_extent_size;
8112                 spin_unlock(&space_info->lock);
8113                 ins->offset = ffe_ctl.max_extent_size;
8114         }
8115         return ret;
8116 }
8117
/*
 * Log the size and reserved counters of one block reserve of @fs_info.
 *
 * @rsv_name is the name of a struct btrfs_block_rsv member of @fs_info; it is
 * also stringified and used as the prefix of the message.  The reserve's lock
 * is held while the two counters are read and printed.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__dump_rsv = &(fs_info)->rsv_name;      \
                                                                        \
        spin_lock(&__dump_rsv->lock);                                   \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
                   __dump_rsv->size, __dump_rsv->reserved);             \
        spin_unlock(&__dump_rsv->lock);                                 \
} while (0)
8126
8127 static void dump_space_info(struct btrfs_fs_info *fs_info,
8128                             struct btrfs_space_info *info, u64 bytes,
8129                             int dump_block_groups)
8130 {
8131         struct btrfs_block_group_cache *cache;
8132         int index = 0;
8133
8134         spin_lock(&info->lock);
8135         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
8136                    info->flags,
8137                    info->total_bytes - btrfs_space_info_used(info, true),
8138                    info->full ? "" : "not ");
8139         btrfs_info(fs_info,
8140                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
8141                 info->total_bytes, info->bytes_used, info->bytes_pinned,
8142                 info->bytes_reserved, info->bytes_may_use,
8143                 info->bytes_readonly);
8144         spin_unlock(&info->lock);
8145
8146         DUMP_BLOCK_RSV(fs_info, global_block_rsv);
8147         DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
8148         DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
8149         DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
8150         DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
8151
8152         if (!dump_block_groups)
8153                 return;
8154
8155         down_read(&info->groups_sem);
8156 again:
8157         list_for_each_entry(cache, &info->block_groups[index], list) {
8158                 spin_lock(&cache->lock);
8159                 btrfs_info(fs_info,
8160                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8161                         cache->key.objectid, cache->key.offset,
8162                         btrfs_block_group_used(&cache->item), cache->pinned,
8163                         cache->reserved, cache->ro ? "[readonly]" : "");
8164                 btrfs_dump_free_space(cache, bytes);
8165                 spin_unlock(&cache->lock);
8166         }
8167         if (++index < BTRFS_NR_RAID_TYPES)
8168                 goto again;
8169         up_read(&info->groups_sem);
8170 }
8171
8172 /*
8173  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
8174  *                        hole that is at least as big as @num_bytes.
8175  *
8176  * @root           -    The root that will contain this extent
8177  *
8178  * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
8179  *                      is used for accounting purposes. This value differs
8180  *                      from @num_bytes only in the case of compressed extents.
8181  *
8182  * @num_bytes      -    Number of bytes to allocate on-disk.
8183  *
8184  * @min_alloc_size -    Indicates the minimum amount of space that the
8185  *                      allocator should try to satisfy. In some cases
8186  *                      @num_bytes may be larger than what is required and if
8187  *                      the filesystem is fragmented then allocation fails.
8188  *                      However, the presence of @min_alloc_size gives a
8189  *                      chance to try and satisfy the smaller allocation.
8190  *
8191  * @empty_size     -    A hint that you plan on doing more COW. This is the
8192  *                      size in bytes the allocator should try to find free
8193  *                      next to the block it returns.  This is just a hint and
8194  *                      may be ignored by the allocator.
8195  *
8196  * @hint_byte      -    Hint to the allocator to start searching above the byte
8197  *                      address passed. It might be ignored.
8198  *
8199  * @ins            -    This key is modified to record the found hole. It will
8200  *                      have the following values:
8201  *                      ins->objectid == start position
8202  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
8203  *                      ins->offset == the size of the hole.
8204  *
8205  * @is_data        -    Boolean flag indicating whether an extent is
8206  *                      allocated for data (true) or metadata (false)
8207  *
8208  * @delalloc       -    Boolean flag indicating whether this allocation is for
8209  *                      delalloc or not. If 'true' data_rwsem of block groups
8210  *                      is going to be acquired.
8211  *
8212  *
8213  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
8214  * case -ENOSPC is returned then @ins->offset will contain the size of the
8215  * largest available hole the allocator managed to find.
8216  */
8217 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8218                          u64 num_bytes, u64 min_alloc_size,
8219                          u64 empty_size, u64 hint_byte,
8220                          struct btrfs_key *ins, int is_data, int delalloc)
8221 {
8222         struct btrfs_fs_info *fs_info = root->fs_info;
8223         bool final_tried = num_bytes == min_alloc_size;
8224         u64 flags;
8225         int ret;
8226
8227         flags = get_alloc_profile_by_root(root, is_data);
8228 again:
8229         WARN_ON(num_bytes < fs_info->sectorsize);
8230         ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8231                                hint_byte, ins, flags, delalloc);
8232         if (!ret && !is_data) {
8233                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8234         } else if (ret == -ENOSPC) {
8235                 if (!final_tried && ins->offset) {
8236                         num_bytes = min(num_bytes >> 1, ins->offset);
8237                         num_bytes = round_down(num_bytes,
8238                                                fs_info->sectorsize);
8239                         num_bytes = max(num_bytes, min_alloc_size);
8240                         ram_bytes = num_bytes;
8241                         if (num_bytes == min_alloc_size)
8242                                 final_tried = true;
8243                         goto again;
8244                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8245                         struct btrfs_space_info *sinfo;
8246
8247                         sinfo = __find_space_info(fs_info, flags);
8248                         btrfs_err(fs_info,
8249                                   "allocation failed flags %llu, wanted %llu",
8250                                   flags, num_bytes);
8251                         if (sinfo)
8252                                 dump_space_info(fs_info, sinfo, num_bytes, 1);
8253                 }
8254         }
8255
8256         return ret;
8257 }
8258
8259 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8260                                         u64 start, u64 len,
8261                                         int pin, int delalloc)
8262 {
8263         struct btrfs_block_group_cache *cache;
8264         int ret = 0;
8265
8266         cache = btrfs_lookup_block_group(fs_info, start);
8267         if (!cache) {
8268                 btrfs_err(fs_info, "Unable to find block group for %llu",
8269                           start);
8270                 return -ENOSPC;
8271         }
8272
8273         if (pin)
8274                 pin_down_extent(fs_info, cache, start, len, 1);
8275         else {
8276                 if (btrfs_test_opt(fs_info, DISCARD))
8277                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
8278                 btrfs_add_free_space(cache, start, len);
8279                 btrfs_free_reserved_bytes(cache, len, delalloc);
8280                 trace_btrfs_reserved_extent_free(fs_info, start, len);
8281         }
8282
8283         btrfs_put_block_group(cache);
8284         return ret;
8285 }
8286
8287 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8288                                u64 start, u64 len, int delalloc)
8289 {
8290         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8291 }
8292
8293 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8294                                        u64 start, u64 len)
8295 {
8296         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8297 }
8298
8299 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8300                                       u64 parent, u64 root_objectid,
8301                                       u64 flags, u64 owner, u64 offset,
8302                                       struct btrfs_key *ins, int ref_mod)
8303 {
8304         struct btrfs_fs_info *fs_info = trans->fs_info;
8305         int ret;
8306         struct btrfs_extent_item *extent_item;
8307         struct btrfs_extent_inline_ref *iref;
8308         struct btrfs_path *path;
8309         struct extent_buffer *leaf;
8310         int type;
8311         u32 size;
8312
8313         if (parent > 0)
8314                 type = BTRFS_SHARED_DATA_REF_KEY;
8315         else
8316                 type = BTRFS_EXTENT_DATA_REF_KEY;
8317
8318         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8319
8320         path = btrfs_alloc_path();
8321         if (!path)
8322                 return -ENOMEM;
8323
8324         path->leave_spinning = 1;
8325         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8326                                       ins, size);
8327         if (ret) {
8328                 btrfs_free_path(path);
8329                 return ret;
8330         }
8331
8332         leaf = path->nodes[0];
8333         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8334                                      struct btrfs_extent_item);
8335         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8336         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8337         btrfs_set_extent_flags(leaf, extent_item,
8338                                flags | BTRFS_EXTENT_FLAG_DATA);
8339
8340         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8341         btrfs_set_extent_inline_ref_type(leaf, iref, type);
8342         if (parent > 0) {
8343                 struct btrfs_shared_data_ref *ref;
8344                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8345                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8346                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8347         } else {
8348                 struct btrfs_extent_data_ref *ref;
8349                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8350                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8351                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8352                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8353                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8354         }
8355
8356         btrfs_mark_buffer_dirty(path->nodes[0]);
8357         btrfs_free_path(path);
8358
8359         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8360         if (ret)
8361                 return ret;
8362
8363         ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8364         if (ret) { /* -ENOENT, logic error */
8365                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8366                         ins->objectid, ins->offset);
8367                 BUG();
8368         }
8369         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8370         return ret;
8371 }
8372
8373 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8374                                      struct btrfs_delayed_ref_node *node,
8375                                      struct btrfs_delayed_extent_op *extent_op)
8376 {
8377         struct btrfs_fs_info *fs_info = trans->fs_info;
8378         int ret;
8379         struct btrfs_extent_item *extent_item;
8380         struct btrfs_key extent_key;
8381         struct btrfs_tree_block_info *block_info;
8382         struct btrfs_extent_inline_ref *iref;
8383         struct btrfs_path *path;
8384         struct extent_buffer *leaf;
8385         struct btrfs_delayed_tree_ref *ref;
8386         u32 size = sizeof(*extent_item) + sizeof(*iref);
8387         u64 num_bytes;
8388         u64 flags = extent_op->flags_to_set;
8389         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8390
8391         ref = btrfs_delayed_node_to_tree_ref(node);
8392
8393         extent_key.objectid = node->bytenr;
8394         if (skinny_metadata) {
8395                 extent_key.offset = ref->level;
8396                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
8397                 num_bytes = fs_info->nodesize;
8398         } else {
8399                 extent_key.offset = node->num_bytes;
8400                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8401                 size += sizeof(*block_info);
8402                 num_bytes = node->num_bytes;
8403         }
8404
8405         path = btrfs_alloc_path();
8406         if (!path)
8407                 return -ENOMEM;
8408
8409         path->leave_spinning = 1;
8410         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8411                                       &extent_key, size);
8412         if (ret) {
8413                 btrfs_free_path(path);
8414                 return ret;
8415         }
8416
8417         leaf = path->nodes[0];
8418         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8419                                      struct btrfs_extent_item);
8420         btrfs_set_extent_refs(leaf, extent_item, 1);
8421         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8422         btrfs_set_extent_flags(leaf, extent_item,
8423                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8424
8425         if (skinny_metadata) {
8426                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8427         } else {
8428                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8429                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8430                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
8431                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8432         }
8433
8434         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8435                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8436                 btrfs_set_extent_inline_ref_type(leaf, iref,
8437                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8438                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8439         } else {
8440                 btrfs_set_extent_inline_ref_type(leaf, iref,
8441                                                  BTRFS_TREE_BLOCK_REF_KEY);
8442                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8443         }
8444
8445         btrfs_mark_buffer_dirty(leaf);
8446         btrfs_free_path(path);
8447
8448         ret = remove_from_free_space_tree(trans, extent_key.objectid,
8449                                           num_bytes);
8450         if (ret)
8451                 return ret;
8452
8453         ret = update_block_group(trans, fs_info, extent_key.objectid,
8454                                  fs_info->nodesize, 1);
8455         if (ret) { /* -ENOENT, logic error */
8456                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8457                         extent_key.objectid, extent_key.offset);
8458                 BUG();
8459         }
8460
8461         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8462                                           fs_info->nodesize);
8463         return ret;
8464 }
8465
8466 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8467                                      struct btrfs_root *root, u64 owner,
8468                                      u64 offset, u64 ram_bytes,
8469                                      struct btrfs_key *ins)
8470 {
8471         int ret;
8472
8473         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8474
8475         btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8476                            root->root_key.objectid, owner, offset,
8477                            BTRFS_ADD_DELAYED_EXTENT);
8478
8479         ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8480                                          ins->offset, 0,
8481                                          root->root_key.objectid, owner,
8482                                          offset, ram_bytes,
8483                                          BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8484         return ret;
8485 }
8486
8487 /*
8488  * this is used by the tree logging recovery code.  It records that
8489  * an extent has been allocated and makes sure to clear the free
8490  * space cache bits as well
8491  */
8492 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8493                                    u64 root_objectid, u64 owner, u64 offset,
8494                                    struct btrfs_key *ins)
8495 {
8496         struct btrfs_fs_info *fs_info = trans->fs_info;
8497         int ret;
8498         struct btrfs_block_group_cache *block_group;
8499         struct btrfs_space_info *space_info;
8500
8501         /*
8502          * Mixed block groups will exclude before processing the log so we only
8503          * need to do the exclude dance if this fs isn't mixed.
8504          */
8505         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8506                 ret = __exclude_logged_extent(fs_info, ins->objectid,
8507                                               ins->offset);
8508                 if (ret)
8509                         return ret;
8510         }
8511
8512         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8513         if (!block_group)
8514                 return -EINVAL;
8515
8516         space_info = block_group->space_info;
8517         spin_lock(&space_info->lock);
8518         spin_lock(&block_group->lock);
8519         space_info->bytes_reserved += ins->offset;
8520         block_group->reserved += ins->offset;
8521         spin_unlock(&block_group->lock);
8522         spin_unlock(&space_info->lock);
8523
8524         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8525                                          offset, ins, 1);
8526         btrfs_put_block_group(block_group);
8527         return ret;
8528 }
8529
8530 static struct extent_buffer *
8531 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8532                       u64 bytenr, int level, u64 owner)
8533 {
8534         struct btrfs_fs_info *fs_info = root->fs_info;
8535         struct extent_buffer *buf;
8536
8537         buf = btrfs_find_create_tree_block(fs_info, bytenr);
8538         if (IS_ERR(buf))
8539                 return buf;
8540
8541         /*
8542          * Extra safety check in case the extent tree is corrupted and extent
8543          * allocator chooses to use a tree block which is already used and
8544          * locked.
8545          */
8546         if (buf->lock_owner == current->pid) {
8547                 btrfs_err_rl(fs_info,
8548 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8549                         buf->start, btrfs_header_owner(buf), current->pid);
8550                 free_extent_buffer(buf);
8551                 return ERR_PTR(-EUCLEAN);
8552         }
8553
8554         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8555         btrfs_tree_lock(buf);
8556         btrfs_clean_tree_block(buf);
8557         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8558
8559         btrfs_set_lock_blocking_write(buf);
8560         set_extent_buffer_uptodate(buf);
8561
8562         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8563         btrfs_set_header_level(buf, level);
8564         btrfs_set_header_bytenr(buf, buf->start);
8565         btrfs_set_header_generation(buf, trans->transid);
8566         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8567         btrfs_set_header_owner(buf, owner);
8568         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8569         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8570         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8571                 buf->log_index = root->log_transid % 2;
8572                 /*
8573                  * we allow two log transactions at a time, use different
8574                  * EXTENT bit to differentiate dirty pages.
8575                  */
8576                 if (buf->log_index == 0)
8577                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8578                                         buf->start + buf->len - 1, GFP_NOFS);
8579                 else
8580                         set_extent_new(&root->dirty_log_pages, buf->start,
8581                                         buf->start + buf->len - 1);
8582         } else {
8583                 buf->log_index = -1;
8584                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8585                          buf->start + buf->len - 1, GFP_NOFS);
8586         }
8587         trans->dirty = true;
8588         /* this returns a buffer locked for blocking */
8589         return buf;
8590 }
8591
8592 static struct btrfs_block_rsv *
8593 use_block_rsv(struct btrfs_trans_handle *trans,
8594               struct btrfs_root *root, u32 blocksize)
8595 {
8596         struct btrfs_fs_info *fs_info = root->fs_info;
8597         struct btrfs_block_rsv *block_rsv;
8598         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8599         int ret;
8600         bool global_updated = false;
8601
8602         block_rsv = get_block_rsv(trans, root);
8603
8604         if (unlikely(block_rsv->size == 0))
8605                 goto try_reserve;
8606 again:
8607         ret = block_rsv_use_bytes(block_rsv, blocksize);
8608         if (!ret)
8609                 return block_rsv;
8610
8611         if (block_rsv->failfast)
8612                 return ERR_PTR(ret);
8613
8614         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8615                 global_updated = true;
8616                 update_global_block_rsv(fs_info);
8617                 goto again;
8618         }
8619
8620         /*
8621          * The global reserve still exists to save us from ourselves, so don't
8622          * warn_on if we are short on our delayed refs reserve.
8623          */
8624         if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8625             btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8626                 static DEFINE_RATELIMIT_STATE(_rs,
8627                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8628                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8629                 if (__ratelimit(&_rs))
8630                         WARN(1, KERN_DEBUG
8631                                 "BTRFS: block rsv returned %d\n", ret);
8632         }
8633 try_reserve:
8634         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8635                                      BTRFS_RESERVE_NO_FLUSH);
8636         if (!ret)
8637                 return block_rsv;
8638         /*
8639          * If we couldn't reserve metadata bytes try and use some from
8640          * the global reserve if its space type is the same as the global
8641          * reservation.
8642          */
8643         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8644             block_rsv->space_info == global_rsv->space_info) {
8645                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8646                 if (!ret)
8647                         return global_rsv;
8648         }
8649         return ERR_PTR(ret);
8650 }
8651
8652 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8653                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8654 {
8655         block_rsv_add_bytes(block_rsv, blocksize, false);
8656         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8657 }
8658
8659 /*
8660  * finds a free extent and does all the dirty work required for allocation
8661  * returns the tree buffer or an ERR_PTR on error.
8662  */
8663 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8664                                              struct btrfs_root *root,
8665                                              u64 parent, u64 root_objectid,
8666                                              const struct btrfs_disk_key *key,
8667                                              int level, u64 hint,
8668                                              u64 empty_size)
8669 {
8670         struct btrfs_fs_info *fs_info = root->fs_info;
8671         struct btrfs_key ins;
8672         struct btrfs_block_rsv *block_rsv;
8673         struct extent_buffer *buf;
8674         struct btrfs_delayed_extent_op *extent_op;
8675         u64 flags = 0;
8676         int ret;
8677         u32 blocksize = fs_info->nodesize;
8678         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8679
8680 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8681         if (btrfs_is_testing(fs_info)) {
8682                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8683                                             level, root_objectid);
8684                 if (!IS_ERR(buf))
8685                         root->alloc_bytenr += blocksize;
8686                 return buf;
8687         }
8688 #endif
8689
8690         block_rsv = use_block_rsv(trans, root, blocksize);
8691         if (IS_ERR(block_rsv))
8692                 return ERR_CAST(block_rsv);
8693
8694         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8695                                    empty_size, hint, &ins, 0, 0);
8696         if (ret)
8697                 goto out_unuse;
8698
8699         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8700                                     root_objectid);
8701         if (IS_ERR(buf)) {
8702                 ret = PTR_ERR(buf);
8703                 goto out_free_reserved;
8704         }
8705
8706         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8707                 if (parent == 0)
8708                         parent = ins.objectid;
8709                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8710         } else
8711                 BUG_ON(parent > 0);
8712
8713         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8714                 extent_op = btrfs_alloc_delayed_extent_op();
8715                 if (!extent_op) {
8716                         ret = -ENOMEM;
8717                         goto out_free_buf;
8718                 }
8719                 if (key)
8720                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8721                 else
8722                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8723                 extent_op->flags_to_set = flags;
8724                 extent_op->update_key = skinny_metadata ? false : true;
8725                 extent_op->update_flags = true;
8726                 extent_op->is_data = false;
8727                 extent_op->level = level;
8728
8729                 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8730                                    root_objectid, level, 0,
8731                                    BTRFS_ADD_DELAYED_EXTENT);
8732                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8733                                                  ins.offset, parent,
8734                                                  root_objectid, level,
8735                                                  BTRFS_ADD_DELAYED_EXTENT,
8736                                                  extent_op, NULL, NULL);
8737                 if (ret)
8738                         goto out_free_delayed;
8739         }
8740         return buf;
8741
8742 out_free_delayed:
8743         btrfs_free_delayed_extent_op(extent_op);
8744 out_free_buf:
8745         free_extent_buffer(buf);
8746 out_free_reserved:
8747         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8748 out_unuse:
8749         unuse_block_rsv(fs_info, block_rsv, blocksize);
8750         return ERR_PTR(ret);
8751 }
8752
8753 struct walk_control {
8754         u64 refs[BTRFS_MAX_LEVEL];
8755         u64 flags[BTRFS_MAX_LEVEL];
8756         struct btrfs_key update_progress;
8757         struct btrfs_key drop_progress;
8758         int drop_level;
8759         int stage;
8760         int level;
8761         int shared_level;
8762         int update_ref;
8763         int keep_locks;
8764         int reada_slot;
8765         int reada_count;
8766         int restarted;
8767 };
8768
8769 #define DROP_REFERENCE  1
8770 #define UPDATE_BACKREF  2
8771
8772 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8773                                      struct btrfs_root *root,
8774                                      struct walk_control *wc,
8775                                      struct btrfs_path *path)
8776 {
8777         struct btrfs_fs_info *fs_info = root->fs_info;
8778         u64 bytenr;
8779         u64 generation;
8780         u64 refs;
8781         u64 flags;
8782         u32 nritems;
8783         struct btrfs_key key;
8784         struct extent_buffer *eb;
8785         int ret;
8786         int slot;
8787         int nread = 0;
8788
8789         if (path->slots[wc->level] < wc->reada_slot) {
8790                 wc->reada_count = wc->reada_count * 2 / 3;
8791                 wc->reada_count = max(wc->reada_count, 2);
8792         } else {
8793                 wc->reada_count = wc->reada_count * 3 / 2;
8794                 wc->reada_count = min_t(int, wc->reada_count,
8795                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8796         }
8797
8798         eb = path->nodes[wc->level];
8799         nritems = btrfs_header_nritems(eb);
8800
8801         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8802                 if (nread >= wc->reada_count)
8803                         break;
8804
8805                 cond_resched();
8806                 bytenr = btrfs_node_blockptr(eb, slot);
8807                 generation = btrfs_node_ptr_generation(eb, slot);
8808
8809                 if (slot == path->slots[wc->level])
8810                         goto reada;
8811
8812                 if (wc->stage == UPDATE_BACKREF &&
8813                     generation <= root->root_key.offset)
8814                         continue;
8815
8816                 /* We don't lock the tree block, it's OK to be racy here */
8817                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8818                                                wc->level - 1, 1, &refs,
8819                                                &flags);
8820                 /* We don't care about errors in readahead. */
8821                 if (ret < 0)
8822                         continue;
8823                 BUG_ON(refs == 0);
8824
8825                 if (wc->stage == DROP_REFERENCE) {
8826                         if (refs == 1)
8827                                 goto reada;
8828
8829                         if (wc->level == 1 &&
8830                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8831                                 continue;
8832                         if (!wc->update_ref ||
8833                             generation <= root->root_key.offset)
8834                                 continue;
8835                         btrfs_node_key_to_cpu(eb, &key, slot);
8836                         ret = btrfs_comp_cpu_keys(&key,
8837                                                   &wc->update_progress);
8838                         if (ret < 0)
8839                                 continue;
8840                 } else {
8841                         if (wc->level == 1 &&
8842                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8843                                 continue;
8844                 }
8845 reada:
8846                 readahead_tree_block(fs_info, bytenr);
8847                 nread++;
8848         }
8849         wc->reada_slot = slot;
8850 }
8851
8852 /*
8853  * helper to process tree block while walking down the tree.
8854  *
8855  * when wc->stage == UPDATE_BACKREF, this function updates
8856  * back refs for pointers in the block.
8857  *
8858  * NOTE: return value 1 means we should stop walking down.
8859  */
8860 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8861                                    struct btrfs_root *root,
8862                                    struct btrfs_path *path,
8863                                    struct walk_control *wc, int lookup_info)
8864 {
8865         struct btrfs_fs_info *fs_info = root->fs_info;
8866         int level = wc->level;
8867         struct extent_buffer *eb = path->nodes[level];
8868         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8869         int ret;
8870
8871         if (wc->stage == UPDATE_BACKREF &&
8872             btrfs_header_owner(eb) != root->root_key.objectid)
8873                 return 1;
8874
8875         /*
8876          * when reference count of tree block is 1, it won't increase
8877          * again. once full backref flag is set, we never clear it.
8878          */
8879         if (lookup_info &&
8880             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8881              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8882                 BUG_ON(!path->locks[level]);
8883                 ret = btrfs_lookup_extent_info(trans, fs_info,
8884                                                eb->start, level, 1,
8885                                                &wc->refs[level],
8886                                                &wc->flags[level]);
8887                 BUG_ON(ret == -ENOMEM);
8888                 if (ret)
8889                         return ret;
8890                 BUG_ON(wc->refs[level] == 0);
8891         }
8892
8893         if (wc->stage == DROP_REFERENCE) {
8894                 if (wc->refs[level] > 1)
8895                         return 1;
8896
8897                 if (path->locks[level] && !wc->keep_locks) {
8898                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8899                         path->locks[level] = 0;
8900                 }
8901                 return 0;
8902         }
8903
8904         /* wc->stage == UPDATE_BACKREF */
8905         if (!(wc->flags[level] & flag)) {
8906                 BUG_ON(!path->locks[level]);
8907                 ret = btrfs_inc_ref(trans, root, eb, 1);
8908                 BUG_ON(ret); /* -ENOMEM */
8909                 ret = btrfs_dec_ref(trans, root, eb, 0);
8910                 BUG_ON(ret); /* -ENOMEM */
8911                 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8912                                                   eb->len, flag,
8913                                                   btrfs_header_level(eb), 0);
8914                 BUG_ON(ret); /* -ENOMEM */
8915                 wc->flags[level] |= flag;
8916         }
8917
8918         /*
8919          * the block is shared by multiple trees, so it's not good to
8920          * keep the tree lock
8921          */
8922         if (path->locks[level] && level > 0) {
8923                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8924                 path->locks[level] = 0;
8925         }
8926         return 0;
8927 }
8928
8929 /*
8930  * This is used to verify a ref exists for this root to deal with a bug where we
8931  * would have a drop_progress key that hadn't been updated properly.
8932  */
8933 static int check_ref_exists(struct btrfs_trans_handle *trans,
8934                             struct btrfs_root *root, u64 bytenr, u64 parent,
8935                             int level)
8936 {
8937         struct btrfs_path *path;
8938         struct btrfs_extent_inline_ref *iref;
8939         int ret;
8940
8941         path = btrfs_alloc_path();
8942         if (!path)
8943                 return -ENOMEM;
8944
8945         ret = lookup_extent_backref(trans, path, &iref, bytenr,
8946                                     root->fs_info->nodesize, parent,
8947                                     root->root_key.objectid, level, 0);
8948         btrfs_free_path(path);
8949         if (ret == -ENOENT)
8950                 return 0;
8951         if (ret < 0)
8952                 return ret;
8953         return 1;
8954 }
8955
/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * The pointer examined is the one at path->slots[wc->level] in
 * path->nodes[wc->level]; the child it points to lives at wc->level - 1.
 *
 * *lookup_info is an output: it is set to 0 once the child's refs/flags
 * have been looked up here, and to 1 when the child is skipped or its
 * buffer had to be dropped and re-read.
 *
 * Returns 0 when we descended into the child (path and wc->level now
 * describe it, and it is write locked), 1 when the child was skipped, or a
 * negative errno on failure.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct walk_control *wc, int *lookup_info)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        u64 generation;
        u64 parent;
        struct btrfs_key key;
        struct btrfs_key first_key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
        int ret = 0;
        bool need_account = false;

        generation = btrfs_node_ptr_generation(path->nodes[level],
                                               path->slots[level]);
        /*
         * if the lower level block was created before the snapshot
         * was created, we know there is no need to update back refs
         * for the subtree
         */
        if (wc->stage == UPDATE_BACKREF &&
            generation <= root->root_key.offset) {
                *lookup_info = 1;
                return 1;
        }

        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        btrfs_node_key_to_cpu(path->nodes[level], &first_key,
                              path->slots[level]);

        /*
         * Get an extent buffer for the child.  If it was not already
         * present, remember that (reada) so readahead can be triggered
         * below before the blocking read.
         */
        next = find_extent_buffer(fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(fs_info, bytenr);
                if (IS_ERR(next))
                        return PTR_ERR(next);

                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
                                               level - 1);
                reada = 1;
        }
        btrfs_tree_lock(next);
        btrfs_set_lock_blocking_write(next);

        /* Fetch the child's reference count and flags into wc. */
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
                                       &wc->flags[level - 1]);
        if (ret < 0)
                goto out_unlock;

        if (unlikely(wc->refs[level - 1] == 0)) {
                btrfs_err(fs_info, "Missing references.");
                ret = -EIO;
                goto out_unlock;
        }
        *lookup_info = 0;

        if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level - 1] > 1) {
                        /* Child is shared; a skip below must be accounted. */
                        need_account = true;
                        if (level == 1 &&
                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                                goto skip;

                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                goto skip;

                        /*
                         * Keys ordered before update_progress are skipped
                         * rather than updated.
                         */
                        btrfs_node_key_to_cpu(path->nodes[level], &key,
                                              path->slots[level]);
                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
                        if (ret < 0)
                                goto skip;

                        /*
                         * Switch to updating back refs for the subtree
                         * rooted at the child; shared_level records where
                         * the switch happened.
                         */
                        wc->stage = UPDATE_BACKREF;
                        wc->shared_level = level - 1;
                }
        } else {
                if (level == 1 &&
                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                        goto skip;
        }

        /*
         * We are going to descend.  If the buffer we hold is not up to date
         * for this generation, drop it and read the block from disk below.
         */
        if (!btrfs_buffer_uptodate(next, generation, 0)) {
                btrfs_tree_unlock(next);
                free_extent_buffer(next);
                next = NULL;
                *lookup_info = 1;
        }

        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
                next = read_tree_block(fs_info, bytenr, generation, level - 1,
                                       &first_key);
                /*
                 * No buffer is locked on these error paths, so return
                 * directly instead of going through out_unlock.
                 */
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
                        free_extent_buffer(next);
                        return -EIO;
                }
                btrfs_tree_lock(next);
                btrfs_set_lock_blocking_write(next);
        }

        /* Step down: from here on 'level' is the child's level. */
        level--;
        ASSERT(level == btrfs_header_level(next));
        if (level != btrfs_header_level(next)) {
                btrfs_err(root->fs_info, "mismatched level");
                ret = -EIO;
                goto out_unlock;
        }
        path->nodes[level] = next;
        path->slots[level] = 0;
        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
        wc->level = level;
        if (wc->level == 1)
                wc->reada_slot = 0;
        return 0;
skip:
        /*
         * The child is not descended into.  'level' is still the parent's
         * level here; clear the cached child state.
         */
        wc->refs[level - 1] = 0;
        wc->flags[level - 1] = 0;
        if (wc->stage == DROP_REFERENCE) {
                /*
                 * Pick the 'parent' argument for the ref being dropped: the
                 * parent block's start when the parent carries the full
                 * backref flag, otherwise 0 (and the parent must be owned
                 * by this root).
                 */
                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        parent = path->nodes[level]->start;
                } else {
                        ASSERT(root->root_key.objectid ==
                               btrfs_header_owner(path->nodes[level]));
                        if (root->root_key.objectid !=
                            btrfs_header_owner(path->nodes[level])) {
                                btrfs_err(root->fs_info,
                                                "mismatched block owner");
                                ret = -EIO;
                                goto out_unlock;
                        }
                        parent = 0;
                }

                /*
                 * If we had a drop_progress we need to verify the refs are set
                 * as expected.  If we find our ref then we know that from here
                 * on out everything should be correct, and we can clear the
                 * ->restarted flag.
                 */
                if (wc->restarted) {
                        ret = check_ref_exists(trans, root, bytenr, parent,
                                               level - 1);
                        if (ret < 0)
                                goto out_unlock;
                        /* Ref not found: skip the child without freeing. */
                        if (ret == 0)
                                goto no_delete;
                        ret = 0;
                        wc->restarted = 0;
                }

                /*
                 * Reloc tree doesn't contribute to qgroup numbers, and we have
                 * already accounted them at merge time (replace_path),
                 * thus we could skip expensive subtree trace here.
                 */
                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
                    need_account) {
                        ret = btrfs_qgroup_trace_subtree(trans, next,
                                                         generation, level - 1);
                        /*
                         * A failure here is only logged; ret is overwritten
                         * by btrfs_free_extent() below.
                         */
                        if (ret) {
                                btrfs_err_rl(fs_info,
                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
                                             ret);
                        }
                }

                /*
                 * We need to update the next key in our walk control so we can
                 * update the drop_progress key accordingly.  We don't care if
                 * find_next_key doesn't find a key because that means we're at
                 * the end and are going to clean up now.
                 */
                wc->drop_level = level;
                find_next_key(path, level, &wc->drop_progress);

                ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
                                        parent, root->root_key.objectid,
                                        level - 1, 0);
                if (ret)
                        goto out_unlock;
        }
no_delete:
        /* Child skipped: tell the caller to stop descending at this slot. */
        *lookup_info = 1;
        ret = 1;

out_unlock:
        /* Common exit for every path that still holds 'next' locked. */
        btrfs_tree_unlock(next);
        free_extent_buffer(next);

        return ret;
}
9168
9169 /*
9170  * helper to process tree block while walking up the tree.
9171  *
9172  * when wc->stage == DROP_REFERENCE, this function drops
9173  * reference count on the block.
9174  *
9175  * when wc->stage == UPDATE_BACKREF, this function changes
9176  * wc->stage back to DROP_REFERENCE if we changed wc->stage
9177  * to UPDATE_BACKREF previously while processing the block.
9178  *
9179  * NOTE: return value 1 means we should stop walking up.
9180  */
9181 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9182                                  struct btrfs_root *root,
9183                                  struct btrfs_path *path,
9184                                  struct walk_control *wc)
9185 {
9186         struct btrfs_fs_info *fs_info = root->fs_info;
9187         int ret;
9188         int level = wc->level;
9189         struct extent_buffer *eb = path->nodes[level];
9190         u64 parent = 0;
9191
9192         if (wc->stage == UPDATE_BACKREF) {
9193                 BUG_ON(wc->shared_level < level);
9194                 if (level < wc->shared_level)
9195                         goto out;
9196
9197                 ret = find_next_key(path, level + 1, &wc->update_progress);
9198                 if (ret > 0)
9199                         wc->update_ref = 0;
9200
9201                 wc->stage = DROP_REFERENCE;
9202                 wc->shared_level = -1;
9203                 path->slots[level] = 0;
9204
9205                 /*
9206                  * check reference count again if the block isn't locked.
9207                  * we should start walking down the tree again if reference
9208                  * count is one.
9209                  */
9210                 if (!path->locks[level]) {
9211                         BUG_ON(level == 0);
9212                         btrfs_tree_lock(eb);
9213                         btrfs_set_lock_blocking_write(eb);
9214                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9215
9216                         ret = btrfs_lookup_extent_info(trans, fs_info,
9217                                                        eb->start, level, 1,
9218                                                        &wc->refs[level],
9219                                                        &wc->flags[level]);
9220                         if (ret < 0) {
9221                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9222                                 path->locks[level] = 0;
9223                                 return ret;
9224                         }
9225                         BUG_ON(wc->refs[level] == 0);
9226                         if (wc->refs[level] == 1) {
9227                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9228                                 path->locks[level] = 0;
9229                                 return 1;
9230                         }
9231                 }
9232         }
9233
9234         /* wc->stage == DROP_REFERENCE */
9235         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9236
9237         if (wc->refs[level] == 1) {
9238                 if (level == 0) {
9239                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9240                                 ret = btrfs_dec_ref(trans, root, eb, 1);
9241                         else
9242                                 ret = btrfs_dec_ref(trans, root, eb, 0);
9243                         BUG_ON(ret); /* -ENOMEM */
9244                         ret = btrfs_qgroup_trace_leaf_items(trans, eb);
9245                         if (ret) {
9246                                 btrfs_err_rl(fs_info,
9247                                              "error %d accounting leaf items. Quota is out of sync, rescan required.",
9248                                              ret);
9249                         }
9250                 }
9251                 /* make block locked assertion in btrfs_clean_tree_block happy */
9252                 if (!path->locks[level] &&
9253                     btrfs_header_generation(eb) == trans->transid) {
9254                         btrfs_tree_lock(eb);
9255                         btrfs_set_lock_blocking_write(eb);
9256                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9257                 }
9258                 btrfs_clean_tree_block(eb);
9259         }
9260
9261         if (eb == root->node) {
9262                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9263                         parent = eb->start;
9264                 else if (root->root_key.objectid != btrfs_header_owner(eb))
9265                         goto owner_mismatch;
9266         } else {
9267                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9268                         parent = path->nodes[level + 1]->start;
9269                 else if (root->root_key.objectid !=
9270                          btrfs_header_owner(path->nodes[level + 1]))
9271                         goto owner_mismatch;
9272         }
9273
9274         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9275 out:
9276         wc->refs[level] = 0;
9277         wc->flags[level] = 0;
9278         return 0;
9279
9280 owner_mismatch:
9281         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
9282                      btrfs_header_owner(eb), root->root_key.objectid);
9283         return -EUCLEAN;
9284 }
9285
9286 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9287                                    struct btrfs_root *root,
9288                                    struct btrfs_path *path,
9289                                    struct walk_control *wc)
9290 {
9291         int level = wc->level;
9292         int lookup_info = 1;
9293         int ret;
9294
9295         while (level >= 0) {
9296                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9297                 if (ret > 0)
9298                         break;
9299
9300                 if (level == 0)
9301                         break;
9302
9303                 if (path->slots[level] >=
9304                     btrfs_header_nritems(path->nodes[level]))
9305                         break;
9306
9307                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9308                 if (ret > 0) {
9309                         path->slots[level]++;
9310                         continue;
9311                 } else if (ret < 0)
9312                         return ret;
9313                 level = wc->level;
9314         }
9315         return 0;
9316 }
9317
9318 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9319                                  struct btrfs_root *root,
9320                                  struct btrfs_path *path,
9321                                  struct walk_control *wc, int max_level)
9322 {
9323         int level = wc->level;
9324         int ret;
9325
9326         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9327         while (level < max_level && path->nodes[level]) {
9328                 wc->level = level;
9329                 if (path->slots[level] + 1 <
9330                     btrfs_header_nritems(path->nodes[level])) {
9331                         path->slots[level]++;
9332                         return 0;
9333                 } else {
9334                         ret = walk_up_proc(trans, root, path, wc);
9335                         if (ret > 0)
9336                                 return 0;
9337                         if (ret < 0)
9338                                 return ret;
9339
9340                         if (path->locks[level]) {
9341                                 btrfs_tree_unlock_rw(path->nodes[level],
9342                                                      path->locks[level]);
9343                                 path->locks[level] = 0;
9344                         }
9345                         free_extent_buffer(path->nodes[level]);
9346                         path->nodes[level] = NULL;
9347                         level++;
9348                 }
9349         }
9350         return 1;
9351 }
9352
/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that only
 * referenced by the tree.
 *
 * when a shared tree block is found. this function decreases its
 * reference count by one. if update_ref is true, this function
 * also make sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * @root:       root of the tree being dropped
 * @block_rsv:  if non-NULL, installed as trans->block_rsv for every
 *              transaction handle this function starts
 * @update_ref: copied into the walk control (wc->update_ref)
 * @for_reloc:  when zero, the drop may stop early and, unless the root was
 *              fully dropped, the root is handed to btrfs_add_dead_root()
 *
 * Progress is recorded in root_item->drop_progress / drop_level so that an
 * interrupted drop can resume from there on a later call.
 *
 * Returns 0 on success or a negative errno.  Any error other than -EAGAIN
 * is also reported through btrfs_handle_fs_error().
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
                         struct btrfs_block_rsv *block_rsv, int update_ref,
                         int for_reloc)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct walk_control *wc;
        struct btrfs_key key;
        int err = 0;
        int ret;
        int level;
        bool root_dropped = false;

        btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
                goto out;
        }

        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        if (!wc) {
                btrfs_free_path(path);
                err = -ENOMEM;
                goto out;
        }

        trans = btrfs_start_transaction(tree_root, 0);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_free;
        }

        err = btrfs_run_delayed_items(trans);
        if (err)
                goto out_end_trans;

        if (block_rsv)
                trans->block_rsv = block_rsv;

        /*
         * This will help us catch people modifying the fs tree while we're
         * dropping it.  It is unsafe to mess with the fs tree while it's being
         * dropped as we unlock the root node and parent nodes as we walk down
         * the tree, assuming nothing will change.  If something does change
         * then we'll have stale information and drop references to blocks we've
         * already dropped.
         */
        set_bit(BTRFS_ROOT_DELETING, &root->state);
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                /* No recorded progress: start from the locked root node. */
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
                btrfs_set_lock_blocking_write(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                memset(&wc->update_progress, 0,
                       sizeof(wc->update_progress));
        } else {
                /*
                 * Resume an earlier drop: position the path at the recorded
                 * drop_progress key, down to the recorded drop_level.
                 */
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
                memcpy(&wc->update_progress, &key,
                       sizeof(wc->update_progress));

                level = root_item->drop_level;
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                path->lowest_level = 0;
                if (ret < 0) {
                        err = ret;
                        goto out_end_trans;
                }
                WARN_ON(ret > 0);

                /*
                 * unlock our path, this is safe because only this
                 * function is allowed to delete this snapshot
                 */
                btrfs_unlock_up_safe(path, 0);

                /*
                 * Walk from the root level down to drop_level, refreshing
                 * refs/flags for each node on the path.  Only the node at
                 * drop_level is left locked.
                 */
                level = btrfs_header_level(root->node);
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
                        btrfs_set_lock_blocking_write(path->nodes[level]);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

                        ret = btrfs_lookup_extent_info(trans, fs_info,
                                                path->nodes[level]->start,
                                                level, 1, &wc->refs[level],
                                                &wc->flags[level]);
                        if (ret < 0) {
                                err = ret;
                                goto out_end_trans;
                        }
                        BUG_ON(wc->refs[level] == 0);

                        if (level == root_item->drop_level)
                                break;

                        btrfs_tree_unlock(path->nodes[level]);
                        path->locks[level] = 0;
                        WARN_ON(wc->refs[level] != 1);
                        level--;
                }
        }

        /* Initialise the walk state shared by the walk_* helpers. */
        wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
        wc->level = level;
        wc->shared_level = -1;
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

        /*
         * Alternate between walking down and walking up until walk_up_tree()
         * returns a positive value (whole tree processed) or an error occurs.
         */
        while (1) {

                ret = walk_down_tree(trans, root, path, wc);
                if (ret < 0) {
                        err = ret;
                        break;
                }

                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
                if (ret < 0) {
                        err = ret;
                        break;
                }

                if (ret > 0) {
                        BUG_ON(wc->stage != DROP_REFERENCE);
                        break;
                }

                /* Record how far we got in the in-memory root item. */
                if (wc->stage == DROP_REFERENCE) {
                        wc->drop_level = wc->level;
                        btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
                                              &wc->drop_progress,
                                              path->slots[wc->drop_level]);
                }
                btrfs_cpu_key_to_disk(&root_item->drop_progress,
                                      &wc->drop_progress);
                root_item->drop_level = wc->drop_level;

                BUG_ON(wc->level == 0);
                /*
                 * When the transaction should end (or the cleaner should
                 * sleep), write the progress to the root item and end the
                 * transaction; then either bail out with -EAGAIN or start a
                 * new transaction and continue.
                 */
                if (btrfs_should_end_transaction(trans) ||
                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                err = ret;
                                goto out_end_trans;
                        }

                        btrfs_end_transaction_throttle(trans);
                        /* trans is already ended here, hence out_free. */
                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
                                btrfs_debug(fs_info,
                                            "drop snapshot early exit");
                                err = -EAGAIN;
                                goto out_free;
                        }

                        trans = btrfs_start_transaction(tree_root, 0);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
                                goto out_free;
                        }
                        if (block_rsv)
                                trans->block_rsv = block_rsv;
                }
        }
        btrfs_release_path(path);
        if (err)
                goto out_end_trans;

        /* The whole tree has been walked: remove the root item itself. */
        ret = btrfs_del_root(trans, &root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                err = ret;
                goto out_end_trans;
        }

        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
                ret = btrfs_find_root(tree_root, &root->root_key, path,
                                      NULL, NULL);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        err = ret;
                        goto out_end_trans;
                } else if (ret > 0) {
                        /* if we fail to delete the orphan item this time
                         * around, it'll get picked up the next time.
                         *
                         * The most common failure here is just -ENOENT.
                         */
                        btrfs_del_orphan_item(trans, tree_root,
                                              root->root_key.objectid);
                }
        }

        /*
         * Roots tracked in the radix are queued via btrfs_add_dropped_root();
         * otherwise the node buffers and the root reference are released
         * here.
         */
        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
                btrfs_add_dropped_root(trans, root);
        } else {
                free_extent_buffer(root->node);
                free_extent_buffer(root->commit_root);
                btrfs_put_fs_root(root);
        }
        root_dropped = true;
out_end_trans:
        btrfs_end_transaction_throttle(trans);
out_free:
        kfree(wc);
        btrfs_free_path(path);
out:
        /*
         * So if we need to stop dropping the snapshot for whatever reason we
         * need to make sure to add it back to the dead root list so that we
         * keep trying to do the work later.  This also cleans up roots if we
         * don't have it in the radix (like when we recover after a power fail
         * or unmount) so we don't leak memory.
         */
        if (!for_reloc && !root_dropped)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
                btrfs_handle_fs_error(fs_info, err, NULL);
        return err;
}
9597
9598 /*
9599  * drop subtree rooted at tree block 'node'.
9600  *
9601  * NOTE: this function will unlock and release tree block 'node'
9602  * only used by relocation code
9603  */
9604 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9605                         struct btrfs_root *root,
9606                         struct extent_buffer *node,
9607                         struct extent_buffer *parent)
9608 {
9609         struct btrfs_fs_info *fs_info = root->fs_info;
9610         struct btrfs_path *path;
9611         struct walk_control *wc;
9612         int level;
9613         int parent_level;
9614         int ret = 0;
9615         int wret;
9616
9617         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9618
9619         path = btrfs_alloc_path();
9620         if (!path)
9621                 return -ENOMEM;
9622
9623         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9624         if (!wc) {
9625                 btrfs_free_path(path);
9626                 return -ENOMEM;
9627         }
9628
9629         btrfs_assert_tree_locked(parent);
9630         parent_level = btrfs_header_level(parent);
9631         extent_buffer_get(parent);
9632         path->nodes[parent_level] = parent;
9633         path->slots[parent_level] = btrfs_header_nritems(parent);
9634
9635         btrfs_assert_tree_locked(node);
9636         level = btrfs_header_level(node);
9637         path->nodes[level] = node;
9638         path->slots[level] = 0;
9639         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9640
9641         wc->refs[parent_level] = 1;
9642         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9643         wc->level = level;
9644         wc->shared_level = -1;
9645         wc->stage = DROP_REFERENCE;
9646         wc->update_ref = 0;
9647         wc->keep_locks = 1;
9648         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9649
9650         while (1) {
9651                 wret = walk_down_tree(trans, root, path, wc);
9652                 if (wret < 0) {
9653                         ret = wret;
9654                         break;
9655                 }
9656
9657                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9658                 if (wret < 0)
9659                         ret = wret;
9660                 if (wret != 0)
9661                         break;
9662         }
9663
9664         kfree(wc);
9665         btrfs_free_path(path);
9666         return ret;
9667 }
9668
9669 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9670 {
9671         u64 num_devices;
9672         u64 stripped;
9673
9674         /*
9675          * if restripe for this chunk_type is on pick target profile and
9676          * return, otherwise do the usual balance
9677          */
9678         stripped = get_restripe_target(fs_info, flags);
9679         if (stripped)
9680                 return extended_to_chunk(stripped);
9681
9682         num_devices = fs_info->fs_devices->rw_devices;
9683
9684         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9685                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9686                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9687
9688         if (num_devices == 1) {
9689                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9690                 stripped = flags & ~stripped;
9691
9692                 /* turn raid0 into single device chunks */
9693                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9694                         return stripped;
9695
9696                 /* turn mirroring into duplication */
9697                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9698                              BTRFS_BLOCK_GROUP_RAID10))
9699                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9700         } else {
9701                 /* they already had raid on here, just return */
9702                 if (flags & stripped)
9703                         return flags;
9704
9705                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9706                 stripped = flags & ~stripped;
9707
9708                 /* switch duplicated blocks with raid1 */
9709                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9710                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9711
9712                 /* this is drive concat, leave it alone */
9713         }
9714
9715         return flags;
9716 }
9717
9718 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9719 {
9720         struct btrfs_space_info *sinfo = cache->space_info;
9721         u64 num_bytes;
9722         u64 sinfo_used;
9723         u64 min_allocable_bytes;
9724         int ret = -ENOSPC;
9725
9726         /*
9727          * We need some metadata space and system metadata space for
9728          * allocating chunks in some corner cases until we force to set
9729          * it to be readonly.
9730          */
9731         if ((sinfo->flags &
9732              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9733             !force)
9734                 min_allocable_bytes = SZ_1M;
9735         else
9736                 min_allocable_bytes = 0;
9737
9738         spin_lock(&sinfo->lock);
9739         spin_lock(&cache->lock);
9740
9741         if (cache->ro) {
9742                 cache->ro++;
9743                 ret = 0;
9744                 goto out;
9745         }
9746
9747         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9748                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9749         sinfo_used = btrfs_space_info_used(sinfo, true);
9750
9751         if (sinfo_used + num_bytes + min_allocable_bytes <=
9752             sinfo->total_bytes) {
9753                 sinfo->bytes_readonly += num_bytes;
9754                 cache->ro++;
9755                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9756                 ret = 0;
9757         }
9758 out:
9759         spin_unlock(&cache->lock);
9760         spin_unlock(&sinfo->lock);
9761         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9762                 btrfs_info(cache->fs_info,
9763                         "unable to make block group %llu ro",
9764                         cache->key.objectid);
9765                 btrfs_info(cache->fs_info,
9766                         "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9767                         sinfo_used, num_bytes, min_allocable_bytes);
9768                 dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9769         }
9770         return ret;
9771 }
9772
9773 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9774
9775 {
9776         struct btrfs_fs_info *fs_info = cache->fs_info;
9777         struct btrfs_trans_handle *trans;
9778         u64 alloc_flags;
9779         int ret;
9780
9781 again:
9782         trans = btrfs_join_transaction(fs_info->extent_root);
9783         if (IS_ERR(trans))
9784                 return PTR_ERR(trans);
9785
9786         /*
9787          * we're not allowed to set block groups readonly after the dirty
9788          * block groups cache has started writing.  If it already started,
9789          * back off and let this transaction commit
9790          */
9791         mutex_lock(&fs_info->ro_block_group_mutex);
9792         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9793                 u64 transid = trans->transid;
9794
9795                 mutex_unlock(&fs_info->ro_block_group_mutex);
9796                 btrfs_end_transaction(trans);
9797
9798                 ret = btrfs_wait_for_commit(fs_info, transid);
9799                 if (ret)
9800                         return ret;
9801                 goto again;
9802         }
9803
9804         /*
9805          * if we are changing raid levels, try to allocate a corresponding
9806          * block group with the new raid level.
9807          */
9808         alloc_flags = update_block_group_flags(fs_info, cache->flags);
9809         if (alloc_flags != cache->flags) {
9810                 ret = do_chunk_alloc(trans, alloc_flags,
9811                                      CHUNK_ALLOC_FORCE);
9812                 /*
9813                  * ENOSPC is allowed here, we may have enough space
9814                  * already allocated at the new raid level to
9815                  * carry on
9816                  */
9817                 if (ret == -ENOSPC)
9818                         ret = 0;
9819                 if (ret < 0)
9820                         goto out;
9821         }
9822
9823         ret = inc_block_group_ro(cache, 0);
9824         if (!ret)
9825                 goto out;
9826         alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9827         ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9828         if (ret < 0)
9829                 goto out;
9830         ret = inc_block_group_ro(cache, 0);
9831 out:
9832         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9833                 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9834                 mutex_lock(&fs_info->chunk_mutex);
9835                 check_system_chunk(trans, alloc_flags);
9836                 mutex_unlock(&fs_info->chunk_mutex);
9837         }
9838         mutex_unlock(&fs_info->ro_block_group_mutex);
9839
9840         btrfs_end_transaction(trans);
9841         return ret;
9842 }
9843
9844 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9845 {
9846         u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9847
9848         return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9849 }
9850
9851 /*
9852  * helper to account the unused space of all the readonly block group in the
9853  * space_info. takes mirrors into account.
9854  */
9855 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9856 {
9857         struct btrfs_block_group_cache *block_group;
9858         u64 free_bytes = 0;
9859         int factor;
9860
9861         /* It's df, we don't care if it's racy */
9862         if (list_empty(&sinfo->ro_bgs))
9863                 return 0;
9864
9865         spin_lock(&sinfo->lock);
9866         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9867                 spin_lock(&block_group->lock);
9868
9869                 if (!block_group->ro) {
9870                         spin_unlock(&block_group->lock);
9871                         continue;
9872                 }
9873
9874                 factor = btrfs_bg_type_to_factor(block_group->flags);
9875                 free_bytes += (block_group->key.offset -
9876                                btrfs_block_group_used(&block_group->item)) *
9877                                factor;
9878
9879                 spin_unlock(&block_group->lock);
9880         }
9881         spin_unlock(&sinfo->lock);
9882
9883         return free_bytes;
9884 }
9885
9886 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9887 {
9888         struct btrfs_space_info *sinfo = cache->space_info;
9889         u64 num_bytes;
9890
9891         BUG_ON(!cache->ro);
9892
9893         spin_lock(&sinfo->lock);
9894         spin_lock(&cache->lock);
9895         if (!--cache->ro) {
9896                 num_bytes = cache->key.offset - cache->reserved -
9897                             cache->pinned - cache->bytes_super -
9898                             btrfs_block_group_used(&cache->item);
9899                 sinfo->bytes_readonly -= num_bytes;
9900                 list_del_init(&cache->ro_list);
9901         }
9902         spin_unlock(&cache->lock);
9903         spin_unlock(&sinfo->lock);
9904 }
9905
9906 /*
9907  * Checks to see if it's even possible to relocate this block group.
9908  *
9909  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9910  * ok to go ahead and try.
9911  */
9912 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9913 {
9914         struct btrfs_block_group_cache *block_group;
9915         struct btrfs_space_info *space_info;
9916         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9917         struct btrfs_device *device;
9918         u64 min_free;
9919         u64 dev_min = 1;
9920         u64 dev_nr = 0;
9921         u64 target;
9922         int debug;
9923         int index;
9924         int full = 0;
9925         int ret = 0;
9926
9927         debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9928
9929         block_group = btrfs_lookup_block_group(fs_info, bytenr);
9930
9931         /* odd, couldn't find the block group, leave it alone */
9932         if (!block_group) {
9933                 if (debug)
9934                         btrfs_warn(fs_info,
9935                                    "can't find block group for bytenr %llu",
9936                                    bytenr);
9937                 return -1;
9938         }
9939
9940         min_free = btrfs_block_group_used(&block_group->item);
9941
9942         /* no bytes used, we're good */
9943         if (!min_free)
9944                 goto out;
9945
9946         space_info = block_group->space_info;
9947         spin_lock(&space_info->lock);
9948
9949         full = space_info->full;
9950
9951         /*
9952          * if this is the last block group we have in this space, we can't
9953          * relocate it unless we're able to allocate a new chunk below.
9954          *
9955          * Otherwise, we need to make sure we have room in the space to handle
9956          * all of the extents from this block group.  If we can, we're good
9957          */
9958         if ((space_info->total_bytes != block_group->key.offset) &&
9959             (btrfs_space_info_used(space_info, false) + min_free <
9960              space_info->total_bytes)) {
9961                 spin_unlock(&space_info->lock);
9962                 goto out;
9963         }
9964         spin_unlock(&space_info->lock);
9965
9966         /*
9967          * ok we don't have enough space, but maybe we have free space on our
9968          * devices to allocate new chunks for relocation, so loop through our
9969          * alloc devices and guess if we have enough space.  if this block
9970          * group is going to be restriped, run checks against the target
9971          * profile instead of the current one.
9972          */
9973         ret = -1;
9974
9975         /*
9976          * index:
9977          *      0: raid10
9978          *      1: raid1
9979          *      2: dup
9980          *      3: raid0
9981          *      4: single
9982          */
9983         target = get_restripe_target(fs_info, block_group->flags);
9984         if (target) {
9985                 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9986         } else {
9987                 /*
9988                  * this is just a balance, so if we were marked as full
9989                  * we know there is no space for a new chunk
9990                  */
9991                 if (full) {
9992                         if (debug)
9993                                 btrfs_warn(fs_info,
9994                                            "no space to alloc new chunk for block group %llu",
9995                                            block_group->key.objectid);
9996                         goto out;
9997                 }
9998
9999                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
10000         }
10001
10002         if (index == BTRFS_RAID_RAID10) {
10003                 dev_min = 4;
10004                 /* Divide by 2 */
10005                 min_free >>= 1;
10006         } else if (index == BTRFS_RAID_RAID1) {
10007                 dev_min = 2;
10008         } else if (index == BTRFS_RAID_DUP) {
10009                 /* Multiply by 2 */
10010                 min_free <<= 1;
10011         } else if (index == BTRFS_RAID_RAID0) {
10012                 dev_min = fs_devices->rw_devices;
10013                 min_free = div64_u64(min_free, dev_min);
10014         }
10015
10016         mutex_lock(&fs_info->chunk_mutex);
10017         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
10018                 u64 dev_offset;
10019
10020                 /*
10021                  * check to make sure we can actually find a chunk with enough
10022                  * space to fit our block group in.
10023                  */
10024                 if (device->total_bytes > device->bytes_used + min_free &&
10025                     !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
10026                         ret = find_free_dev_extent(device, min_free,
10027                                                    &dev_offset, NULL);
10028                         if (!ret)
10029                                 dev_nr++;
10030
10031                         if (dev_nr >= dev_min)
10032                                 break;
10033
10034                         ret = -1;
10035                 }
10036         }
10037         if (debug && ret == -1)
10038                 btrfs_warn(fs_info,
10039                            "no space to allocate a new chunk for block group %llu",
10040                            block_group->key.objectid);
10041         mutex_unlock(&fs_info->chunk_mutex);
10042 out:
10043         btrfs_put_block_group(block_group);
10044         return ret;
10045 }
10046
10047 static int find_first_block_group(struct btrfs_fs_info *fs_info,
10048                                   struct btrfs_path *path,
10049                                   struct btrfs_key *key)
10050 {
10051         struct btrfs_root *root = fs_info->extent_root;
10052         int ret = 0;
10053         struct btrfs_key found_key;
10054         struct extent_buffer *leaf;
10055         struct btrfs_block_group_item bg;
10056         u64 flags;
10057         int slot;
10058
10059         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
10060         if (ret < 0)
10061                 goto out;
10062
10063         while (1) {
10064                 slot = path->slots[0];
10065                 leaf = path->nodes[0];
10066                 if (slot >= btrfs_header_nritems(leaf)) {
10067                         ret = btrfs_next_leaf(root, path);
10068                         if (ret == 0)
10069                                 continue;
10070                         if (ret < 0)
10071                                 goto out;
10072                         break;
10073                 }
10074                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
10075
10076                 if (found_key.objectid >= key->objectid &&
10077                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
10078                         struct extent_map_tree *em_tree;
10079                         struct extent_map *em;
10080
10081                         em_tree = &root->fs_info->mapping_tree.map_tree;
10082                         read_lock(&em_tree->lock);
10083                         em = lookup_extent_mapping(em_tree, found_key.objectid,
10084                                                    found_key.offset);
10085                         read_unlock(&em_tree->lock);
10086                         if (!em) {
10087                                 btrfs_err(fs_info,
10088                         "logical %llu len %llu found bg but no related chunk",
10089                                           found_key.objectid, found_key.offset);
10090                                 ret = -ENOENT;
10091                         } else if (em->start != found_key.objectid ||
10092                                    em->len != found_key.offset) {
10093                                 btrfs_err(fs_info,
10094                 "block group %llu len %llu mismatch with chunk %llu len %llu",
10095                                           found_key.objectid, found_key.offset,
10096                                           em->start, em->len);
10097                                 ret = -EUCLEAN;
10098                         } else {
10099                                 read_extent_buffer(leaf, &bg,
10100                                         btrfs_item_ptr_offset(leaf, slot),
10101                                         sizeof(bg));
10102                                 flags = btrfs_block_group_flags(&bg) &
10103                                         BTRFS_BLOCK_GROUP_TYPE_MASK;
10104
10105                                 if (flags != (em->map_lookup->type &
10106                                               BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10107                                         btrfs_err(fs_info,
10108 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
10109                                                 found_key.objectid,
10110                                                 found_key.offset, flags,
10111                                                 (BTRFS_BLOCK_GROUP_TYPE_MASK &
10112                                                  em->map_lookup->type));
10113                                         ret = -EUCLEAN;
10114                                 } else {
10115                                         ret = 0;
10116                                 }
10117                         }
10118                         free_extent_map(em);
10119                         goto out;
10120                 }
10121                 path->slots[0]++;
10122         }
10123 out:
10124         return ret;
10125 }
10126
10127 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
10128 {
10129         struct btrfs_block_group_cache *block_group;
10130         u64 last = 0;
10131
10132         while (1) {
10133                 struct inode *inode;
10134
10135                 block_group = btrfs_lookup_first_block_group(info, last);
10136                 while (block_group) {
10137                         wait_block_group_cache_done(block_group);
10138                         spin_lock(&block_group->lock);
10139                         if (block_group->iref)
10140                                 break;
10141                         spin_unlock(&block_group->lock);
10142                         block_group = next_block_group(info, block_group);
10143                 }
10144                 if (!block_group) {
10145                         if (last == 0)
10146                                 break;
10147                         last = 0;
10148                         continue;
10149                 }
10150
10151                 inode = block_group->inode;
10152                 block_group->iref = 0;
10153                 block_group->inode = NULL;
10154                 spin_unlock(&block_group->lock);
10155                 ASSERT(block_group->io_ctl.inode == NULL);
10156                 iput(inode);
10157                 last = block_group->key.objectid + block_group->key.offset;
10158                 btrfs_put_block_group(block_group);
10159         }
10160 }
10161
10162 /*
10163  * Must be called only after stopping all workers, since we could have block
10164  * group caching kthreads running, and therefore they could race with us if we
10165  * freed the block groups before stopping them.
10166  */
10167 int btrfs_free_block_groups(struct btrfs_fs_info *info)
10168 {
10169         struct btrfs_block_group_cache *block_group;
10170         struct btrfs_space_info *space_info;
10171         struct btrfs_caching_control *caching_ctl;
10172         struct rb_node *n;
10173
10174         down_write(&info->commit_root_sem);
10175         while (!list_empty(&info->caching_block_groups)) {
10176                 caching_ctl = list_entry(info->caching_block_groups.next,
10177                                          struct btrfs_caching_control, list);
10178                 list_del(&caching_ctl->list);
10179                 put_caching_control(caching_ctl);
10180         }
10181         up_write(&info->commit_root_sem);
10182
10183         spin_lock(&info->unused_bgs_lock);
10184         while (!list_empty(&info->unused_bgs)) {
10185                 block_group = list_first_entry(&info->unused_bgs,
10186                                                struct btrfs_block_group_cache,
10187                                                bg_list);
10188                 list_del_init(&block_group->bg_list);
10189                 btrfs_put_block_group(block_group);
10190         }
10191         spin_unlock(&info->unused_bgs_lock);
10192
10193         spin_lock(&info->block_group_cache_lock);
10194         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
10195                 block_group = rb_entry(n, struct btrfs_block_group_cache,
10196                                        cache_node);
10197                 rb_erase(&block_group->cache_node,
10198                          &info->block_group_cache_tree);
10199                 RB_CLEAR_NODE(&block_group->cache_node);
10200                 spin_unlock(&info->block_group_cache_lock);
10201
10202                 down_write(&block_group->space_info->groups_sem);
10203                 list_del(&block_group->list);
10204                 up_write(&block_group->space_info->groups_sem);
10205
10206                 /*
10207                  * We haven't cached this block group, which means we could
10208                  * possibly have excluded extents on this block group.
10209                  */
10210                 if (block_group->cached == BTRFS_CACHE_NO ||
10211                     block_group->cached == BTRFS_CACHE_ERROR)
10212                         free_excluded_extents(block_group);
10213
10214                 btrfs_remove_free_space_cache(block_group);
10215                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
10216                 ASSERT(list_empty(&block_group->dirty_list));
10217                 ASSERT(list_empty(&block_group->io_list));
10218                 ASSERT(list_empty(&block_group->bg_list));
10219                 ASSERT(atomic_read(&block_group->count) == 1);
10220                 btrfs_put_block_group(block_group);
10221
10222                 spin_lock(&info->block_group_cache_lock);
10223         }
10224         spin_unlock(&info->block_group_cache_lock);
10225
10226         /* now that all the block groups are freed, go through and
10227          * free all the space_info structs.  This is only called during
10228          * the final stages of unmount, and so we know nobody is
10229          * using them.  We call synchronize_rcu() once before we start,
10230          * just to be on the safe side.
10231          */
10232         synchronize_rcu();
10233
10234         release_global_block_rsv(info);
10235
10236         while (!list_empty(&info->space_info)) {
10237                 int i;
10238
10239                 space_info = list_entry(info->space_info.next,
10240                                         struct btrfs_space_info,
10241                                         list);
10242
10243                 /*
10244                  * Do not hide this behind enospc_debug, this is actually
10245                  * important and indicates a real bug if this happens.
10246                  */
10247                 if (WARN_ON(space_info->bytes_pinned > 0 ||
10248                             space_info->bytes_reserved > 0 ||
10249                             space_info->bytes_may_use > 0))
10250                         dump_space_info(info, space_info, 0, 0);
10251                 list_del(&space_info->list);
10252                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10253                         struct kobject *kobj;
10254                         kobj = space_info->block_group_kobjs[i];
10255                         space_info->block_group_kobjs[i] = NULL;
10256                         if (kobj) {
10257                                 kobject_del(kobj);
10258                                 kobject_put(kobj);
10259                         }
10260                 }
10261                 kobject_del(&space_info->kobj);
10262                 kobject_put(&space_info->kobj);
10263         }
10264         return 0;
10265 }
10266
10267 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
10268 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10269 {
10270         struct btrfs_space_info *space_info;
10271         struct raid_kobject *rkobj;
10272         LIST_HEAD(list);
10273         int index;
10274         int ret = 0;
10275
10276         spin_lock(&fs_info->pending_raid_kobjs_lock);
10277         list_splice_init(&fs_info->pending_raid_kobjs, &list);
10278         spin_unlock(&fs_info->pending_raid_kobjs_lock);
10279
10280         list_for_each_entry(rkobj, &list, list) {
10281                 space_info = __find_space_info(fs_info, rkobj->flags);
10282                 index = btrfs_bg_flags_to_raid_index(rkobj->flags);
10283
10284                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10285                                   "%s", get_raid_name(index));
10286                 if (ret) {
10287                         kobject_put(&rkobj->kobj);
10288                         break;
10289                 }
10290         }
10291         if (ret)
10292                 btrfs_warn(fs_info,
10293                            "failed to add kobject for block cache, ignoring");
10294 }
10295
10296 static void link_block_group(struct btrfs_block_group_cache *cache)
10297 {
10298         struct btrfs_space_info *space_info = cache->space_info;
10299         struct btrfs_fs_info *fs_info = cache->fs_info;
10300         int index = btrfs_bg_flags_to_raid_index(cache->flags);
10301         bool first = false;
10302
10303         down_write(&space_info->groups_sem);
10304         if (list_empty(&space_info->block_groups[index]))
10305                 first = true;
10306         list_add_tail(&cache->list, &space_info->block_groups[index]);
10307         up_write(&space_info->groups_sem);
10308
10309         if (first) {
10310                 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10311                 if (!rkobj) {
10312                         btrfs_warn(cache->fs_info,
10313                                 "couldn't alloc memory for raid level kobject");
10314                         return;
10315                 }
10316                 rkobj->flags = cache->flags;
10317                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10318
10319                 spin_lock(&fs_info->pending_raid_kobjs_lock);
10320                 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10321                 spin_unlock(&fs_info->pending_raid_kobjs_lock);
10322                 space_info->block_group_kobjs[index] = &rkobj->kobj;
10323         }
10324 }
10325
10326 static struct btrfs_block_group_cache *
10327 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10328                                u64 start, u64 size)
10329 {
10330         struct btrfs_block_group_cache *cache;
10331
10332         cache = kzalloc(sizeof(*cache), GFP_NOFS);
10333         if (!cache)
10334                 return NULL;
10335
10336         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10337                                         GFP_NOFS);
10338         if (!cache->free_space_ctl) {
10339                 kfree(cache);
10340                 return NULL;
10341         }
10342
10343         cache->key.objectid = start;
10344         cache->key.offset = size;
10345         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10346
10347         cache->fs_info = fs_info;
10348         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10349         set_free_space_tree_thresholds(cache);
10350
10351         atomic_set(&cache->count, 1);
10352         spin_lock_init(&cache->lock);
10353         init_rwsem(&cache->data_rwsem);
10354         INIT_LIST_HEAD(&cache->list);
10355         INIT_LIST_HEAD(&cache->cluster_list);
10356         INIT_LIST_HEAD(&cache->bg_list);
10357         INIT_LIST_HEAD(&cache->ro_list);
10358         INIT_LIST_HEAD(&cache->dirty_list);
10359         INIT_LIST_HEAD(&cache->io_list);
10360         btrfs_init_free_space_ctl(cache);
10361         atomic_set(&cache->trimming, 0);
10362         mutex_init(&cache->free_space_lock);
10363         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10364
10365         return cache;
10366 }
10367
10368
10369 /*
10370  * Iterate all chunks and verify that each of them has the corresponding block
10371  * group
10372  */
10373 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10374 {
10375         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
10376         struct extent_map *em;
10377         struct btrfs_block_group_cache *bg;
10378         u64 start = 0;
10379         int ret = 0;
10380
10381         while (1) {
10382                 read_lock(&map_tree->map_tree.lock);
10383                 /*
10384                  * lookup_extent_mapping will return the first extent map
10385                  * intersecting the range, so setting @len to 1 is enough to
10386                  * get the first chunk.
10387                  */
10388                 em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
10389                 read_unlock(&map_tree->map_tree.lock);
10390                 if (!em)
10391                         break;
10392
10393                 bg = btrfs_lookup_block_group(fs_info, em->start);
10394                 if (!bg) {
10395                         btrfs_err(fs_info,
10396         "chunk start=%llu len=%llu doesn't have corresponding block group",
10397                                      em->start, em->len);
10398                         ret = -EUCLEAN;
10399                         free_extent_map(em);
10400                         break;
10401                 }
10402                 if (bg->key.objectid != em->start ||
10403                     bg->key.offset != em->len ||
10404                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10405                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10406                         btrfs_err(fs_info,
10407 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10408                                 em->start, em->len,
10409                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10410                                 bg->key.objectid, bg->key.offset,
10411                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10412                         ret = -EUCLEAN;
10413                         free_extent_map(em);
10414                         btrfs_put_block_group(bg);
10415                         break;
10416                 }
10417                 start = em->start + em->len;
10418                 free_extent_map(em);
10419                 btrfs_put_block_group(bg);
10420         }
10421         return ret;
10422 }
10423
10424 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10425 {
10426         struct btrfs_path *path;
10427         int ret;
10428         struct btrfs_block_group_cache *cache;
10429         struct btrfs_space_info *space_info;
10430         struct btrfs_key key;
10431         struct btrfs_key found_key;
10432         struct extent_buffer *leaf;
10433         int need_clear = 0;
10434         u64 cache_gen;
10435         u64 feature;
10436         int mixed;
10437
10438         feature = btrfs_super_incompat_flags(info->super_copy);
10439         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10440
10441         key.objectid = 0;
10442         key.offset = 0;
10443         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10444         path = btrfs_alloc_path();
10445         if (!path)
10446                 return -ENOMEM;
10447         path->reada = READA_FORWARD;
10448
10449         cache_gen = btrfs_super_cache_generation(info->super_copy);
10450         if (btrfs_test_opt(info, SPACE_CACHE) &&
10451             btrfs_super_generation(info->super_copy) != cache_gen)
10452                 need_clear = 1;
10453         if (btrfs_test_opt(info, CLEAR_CACHE))
10454                 need_clear = 1;
10455
10456         while (1) {
10457                 ret = find_first_block_group(info, path, &key);
10458                 if (ret > 0)
10459                         break;
10460                 if (ret != 0)
10461                         goto error;
10462
10463                 leaf = path->nodes[0];
10464                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10465
10466                 cache = btrfs_create_block_group_cache(info, found_key.objectid,
10467                                                        found_key.offset);
10468                 if (!cache) {
10469                         ret = -ENOMEM;
10470                         goto error;
10471                 }
10472
10473                 if (need_clear) {
10474                         /*
10475                          * When we mount with old space cache, we need to
10476                          * set BTRFS_DC_CLEAR and set dirty flag.
10477                          *
10478                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10479                          *    truncate the old free space cache inode and
10480                          *    setup a new one.
10481                          * b) Setting 'dirty flag' makes sure that we flush
10482                          *    the new space cache info onto disk.
10483                          */
10484                         if (btrfs_test_opt(info, SPACE_CACHE))
10485                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10486                 }
10487
10488                 read_extent_buffer(leaf, &cache->item,
10489                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10490                                    sizeof(cache->item));
10491                 cache->flags = btrfs_block_group_flags(&cache->item);
10492                 if (!mixed &&
10493                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10494                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10495                         btrfs_err(info,
10496 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10497                                   cache->key.objectid);
10498                         ret = -EINVAL;
10499                         goto error;
10500                 }
10501
10502                 key.objectid = found_key.objectid + found_key.offset;
10503                 btrfs_release_path(path);
10504
10505                 /*
10506                  * We need to exclude the super stripes now so that the space
10507                  * info has super bytes accounted for, otherwise we'll think
10508                  * we have more space than we actually do.
10509                  */
10510                 ret = exclude_super_stripes(cache);
10511                 if (ret) {
10512                         /*
10513                          * We may have excluded something, so call this just in
10514                          * case.
10515                          */
10516                         free_excluded_extents(cache);
10517                         btrfs_put_block_group(cache);
10518                         goto error;
10519                 }
10520
10521                 /*
10522                  * check for two cases, either we are full, and therefore
10523                  * don't need to bother with the caching work since we won't
10524                  * find any space, or we are empty, and we can just add all
10525                  * the space in and be done with it.  This saves us _a_lot_ of
10526                  * time, particularly in the full case.
10527                  */
10528                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10529                         cache->last_byte_to_unpin = (u64)-1;
10530                         cache->cached = BTRFS_CACHE_FINISHED;
10531                         free_excluded_extents(cache);
10532                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10533                         cache->last_byte_to_unpin = (u64)-1;
10534                         cache->cached = BTRFS_CACHE_FINISHED;
10535                         add_new_free_space(cache, found_key.objectid,
10536                                            found_key.objectid +
10537                                            found_key.offset);
10538                         free_excluded_extents(cache);
10539                 }
10540
10541                 ret = btrfs_add_block_group_cache(info, cache);
10542                 if (ret) {
10543                         btrfs_remove_free_space_cache(cache);
10544                         btrfs_put_block_group(cache);
10545                         goto error;
10546                 }
10547
10548                 trace_btrfs_add_block_group(info, cache, 0);
10549                 update_space_info(info, cache->flags, found_key.offset,
10550                                   btrfs_block_group_used(&cache->item),
10551                                   cache->bytes_super, &space_info);
10552
10553                 cache->space_info = space_info;
10554
10555                 link_block_group(cache);
10556
10557                 set_avail_alloc_bits(info, cache->flags);
10558                 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10559                         inc_block_group_ro(cache, 1);
10560                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10561                         ASSERT(list_empty(&cache->bg_list));
10562                         btrfs_mark_bg_unused(cache);
10563                 }
10564         }
10565
10566         list_for_each_entry_rcu(space_info, &info->space_info, list) {
10567                 if (!(get_alloc_profile(info, space_info->flags) &
10568                       (BTRFS_BLOCK_GROUP_RAID10 |
10569                        BTRFS_BLOCK_GROUP_RAID1 |
10570                        BTRFS_BLOCK_GROUP_RAID5 |
10571                        BTRFS_BLOCK_GROUP_RAID6 |
10572                        BTRFS_BLOCK_GROUP_DUP)))
10573                         continue;
10574                 /*
10575                  * avoid allocating from un-mirrored block group if there are
10576                  * mirrored block groups.
10577                  */
10578                 list_for_each_entry(cache,
10579                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10580                                 list)
10581                         inc_block_group_ro(cache, 1);
10582                 list_for_each_entry(cache,
10583                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10584                                 list)
10585                         inc_block_group_ro(cache, 1);
10586         }
10587
10588         btrfs_add_raid_kobjects(info);
10589         init_global_block_rsv(info);
10590         ret = check_chunk_block_group_mappings(info);
10591 error:
10592         btrfs_free_path(path);
10593         return ret;
10594 }
10595
10596 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10597 {
10598         struct btrfs_fs_info *fs_info = trans->fs_info;
10599         struct btrfs_block_group_cache *block_group;
10600         struct btrfs_root *extent_root = fs_info->extent_root;
10601         struct btrfs_block_group_item item;
10602         struct btrfs_key key;
10603         int ret = 0;
10604
10605         if (!trans->can_flush_pending_bgs)
10606                 return;
10607
10608         while (!list_empty(&trans->new_bgs)) {
10609                 block_group = list_first_entry(&trans->new_bgs,
10610                                                struct btrfs_block_group_cache,
10611                                                bg_list);
10612                 if (ret)
10613                         goto next;
10614
10615                 spin_lock(&block_group->lock);
10616                 memcpy(&item, &block_group->item, sizeof(item));
10617                 memcpy(&key, &block_group->key, sizeof(key));
10618                 spin_unlock(&block_group->lock);
10619
10620                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10621                                         sizeof(item));
10622                 if (ret)
10623                         btrfs_abort_transaction(trans, ret);
10624                 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10625                 if (ret)
10626                         btrfs_abort_transaction(trans, ret);
10627                 add_block_group_free_space(trans, block_group);
10628                 /* already aborted the transaction if it failed. */
10629 next:
10630                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10631                 list_del_init(&block_group->bg_list);
10632         }
10633         btrfs_trans_release_chunk_metadata(trans);
10634 }
10635
10636 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10637                            u64 type, u64 chunk_offset, u64 size)
10638 {
10639         struct btrfs_fs_info *fs_info = trans->fs_info;
10640         struct btrfs_block_group_cache *cache;
10641         int ret;
10642
10643         btrfs_set_log_full_commit(fs_info, trans);
10644
10645         cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10646         if (!cache)
10647                 return -ENOMEM;
10648
10649         btrfs_set_block_group_used(&cache->item, bytes_used);
10650         btrfs_set_block_group_chunk_objectid(&cache->item,
10651                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10652         btrfs_set_block_group_flags(&cache->item, type);
10653
10654         cache->flags = type;
10655         cache->last_byte_to_unpin = (u64)-1;
10656         cache->cached = BTRFS_CACHE_FINISHED;
10657         cache->needs_free_space = 1;
10658         ret = exclude_super_stripes(cache);
10659         if (ret) {
10660                 /*
10661                  * We may have excluded something, so call this just in
10662                  * case.
10663                  */
10664                 free_excluded_extents(cache);
10665                 btrfs_put_block_group(cache);
10666                 return ret;
10667         }
10668
10669         add_new_free_space(cache, chunk_offset, chunk_offset + size);
10670
10671         free_excluded_extents(cache);
10672
10673 #ifdef CONFIG_BTRFS_DEBUG
10674         if (btrfs_should_fragment_free_space(cache)) {
10675                 u64 new_bytes_used = size - bytes_used;
10676
10677                 bytes_used += new_bytes_used >> 1;
10678                 fragment_free_space(cache);
10679         }
10680 #endif
10681         /*
10682          * Ensure the corresponding space_info object is created and
10683          * assigned to our block group. We want our bg to be added to the rbtree
10684          * with its ->space_info set.
10685          */
10686         cache->space_info = __find_space_info(fs_info, cache->flags);
10687         ASSERT(cache->space_info);
10688
10689         ret = btrfs_add_block_group_cache(fs_info, cache);
10690         if (ret) {
10691                 btrfs_remove_free_space_cache(cache);
10692                 btrfs_put_block_group(cache);
10693                 return ret;
10694         }
10695
10696         /*
10697          * Now that our block group has its ->space_info set and is inserted in
10698          * the rbtree, update the space info's counters.
10699          */
10700         trace_btrfs_add_block_group(fs_info, cache, 1);
10701         update_space_info(fs_info, cache->flags, size, bytes_used,
10702                                 cache->bytes_super, &cache->space_info);
10703         update_global_block_rsv(fs_info);
10704
10705         link_block_group(cache);
10706
10707         list_add_tail(&cache->bg_list, &trans->new_bgs);
10708         trans->delayed_ref_updates++;
10709         btrfs_update_delayed_refs_rsv(trans);
10710
10711         set_avail_alloc_bits(fs_info, type);
10712         return 0;
10713 }
10714
10715 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10716 {
10717         u64 extra_flags = chunk_to_extended(flags) &
10718                                 BTRFS_EXTENDED_PROFILE_MASK;
10719
10720         write_seqlock(&fs_info->profiles_lock);
10721         if (flags & BTRFS_BLOCK_GROUP_DATA)
10722                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10723         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10724                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10725         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10726                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10727         write_sequnlock(&fs_info->profiles_lock);
10728 }
10729
10730 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10731                              u64 group_start, struct extent_map *em)
10732 {
10733         struct btrfs_fs_info *fs_info = trans->fs_info;
10734         struct btrfs_root *root = fs_info->extent_root;
10735         struct btrfs_path *path;
10736         struct btrfs_block_group_cache *block_group;
10737         struct btrfs_free_cluster *cluster;
10738         struct btrfs_root *tree_root = fs_info->tree_root;
10739         struct btrfs_key key;
10740         struct inode *inode;
10741         struct kobject *kobj = NULL;
10742         int ret;
10743         int index;
10744         int factor;
10745         struct btrfs_caching_control *caching_ctl = NULL;
10746         bool remove_em;
10747         bool remove_rsv = false;
10748
10749         block_group = btrfs_lookup_block_group(fs_info, group_start);
10750         BUG_ON(!block_group);
10751         BUG_ON(!block_group->ro);
10752
10753         trace_btrfs_remove_block_group(block_group);
10754         /*
10755          * Free the reserved super bytes from this block group before
10756          * remove it.
10757          */
10758         free_excluded_extents(block_group);
10759         btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10760                                   block_group->key.offset);
10761
10762         memcpy(&key, &block_group->key, sizeof(key));
10763         index = btrfs_bg_flags_to_raid_index(block_group->flags);
10764         factor = btrfs_bg_type_to_factor(block_group->flags);
10765
10766         /* make sure this block group isn't part of an allocation cluster */
10767         cluster = &fs_info->data_alloc_cluster;
10768         spin_lock(&cluster->refill_lock);
10769         btrfs_return_cluster_to_free_space(block_group, cluster);
10770         spin_unlock(&cluster->refill_lock);
10771
10772         /*
10773          * make sure this block group isn't part of a metadata
10774          * allocation cluster
10775          */
10776         cluster = &fs_info->meta_alloc_cluster;
10777         spin_lock(&cluster->refill_lock);
10778         btrfs_return_cluster_to_free_space(block_group, cluster);
10779         spin_unlock(&cluster->refill_lock);
10780
10781         path = btrfs_alloc_path();
10782         if (!path) {
10783                 ret = -ENOMEM;
10784                 goto out;
10785         }
10786
10787         /*
10788          * get the inode first so any iput calls done for the io_list
10789          * aren't the final iput (no unlinks allowed now)
10790          */
10791         inode = lookup_free_space_inode(fs_info, block_group, path);
10792
10793         mutex_lock(&trans->transaction->cache_write_mutex);
10794         /*
10795          * Make sure our free space cache IO is done before removing the
10796          * free space inode
10797          */
10798         spin_lock(&trans->transaction->dirty_bgs_lock);
10799         if (!list_empty(&block_group->io_list)) {
10800                 list_del_init(&block_group->io_list);
10801
10802                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10803
10804                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10805                 btrfs_wait_cache_io(trans, block_group, path);
10806                 btrfs_put_block_group(block_group);
10807                 spin_lock(&trans->transaction->dirty_bgs_lock);
10808         }
10809
10810         if (!list_empty(&block_group->dirty_list)) {
10811                 list_del_init(&block_group->dirty_list);
10812                 remove_rsv = true;
10813                 btrfs_put_block_group(block_group);
10814         }
10815         spin_unlock(&trans->transaction->dirty_bgs_lock);
10816         mutex_unlock(&trans->transaction->cache_write_mutex);
10817
10818         if (!IS_ERR(inode)) {
10819                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10820                 if (ret) {
10821                         btrfs_add_delayed_iput(inode);
10822                         goto out;
10823                 }
10824                 clear_nlink(inode);
10825                 /* One for the block groups ref */
10826                 spin_lock(&block_group->lock);
10827                 if (block_group->iref) {
10828                         block_group->iref = 0;
10829                         block_group->inode = NULL;
10830                         spin_unlock(&block_group->lock);
10831                         iput(inode);
10832                 } else {
10833                         spin_unlock(&block_group->lock);
10834                 }
10835                 /* One for our lookup ref */
10836                 btrfs_add_delayed_iput(inode);
10837         }
10838
10839         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10840         key.offset = block_group->key.objectid;
10841         key.type = 0;
10842
10843         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10844         if (ret < 0)
10845                 goto out;
10846         if (ret > 0)
10847                 btrfs_release_path(path);
10848         if (ret == 0) {
10849                 ret = btrfs_del_item(trans, tree_root, path);
10850                 if (ret)
10851                         goto out;
10852                 btrfs_release_path(path);
10853         }
10854
10855         spin_lock(&fs_info->block_group_cache_lock);
10856         rb_erase(&block_group->cache_node,
10857                  &fs_info->block_group_cache_tree);
10858         RB_CLEAR_NODE(&block_group->cache_node);
10859
10860         if (fs_info->first_logical_byte == block_group->key.objectid)
10861                 fs_info->first_logical_byte = (u64)-1;
10862         spin_unlock(&fs_info->block_group_cache_lock);
10863
10864         down_write(&block_group->space_info->groups_sem);
10865         /*
10866          * we must use list_del_init so people can check to see if they
10867          * are still on the list after taking the semaphore
10868          */
10869         list_del_init(&block_group->list);
10870         if (list_empty(&block_group->space_info->block_groups[index])) {
10871                 kobj = block_group->space_info->block_group_kobjs[index];
10872                 block_group->space_info->block_group_kobjs[index] = NULL;
10873                 clear_avail_alloc_bits(fs_info, block_group->flags);
10874         }
10875         up_write(&block_group->space_info->groups_sem);
10876         if (kobj) {
10877                 kobject_del(kobj);
10878                 kobject_put(kobj);
10879         }
10880
10881         if (block_group->has_caching_ctl)
10882                 caching_ctl = get_caching_control(block_group);
10883         if (block_group->cached == BTRFS_CACHE_STARTED)
10884                 wait_block_group_cache_done(block_group);
10885         if (block_group->has_caching_ctl) {
10886                 down_write(&fs_info->commit_root_sem);
10887                 if (!caching_ctl) {
10888                         struct btrfs_caching_control *ctl;
10889
10890                         list_for_each_entry(ctl,
10891                                     &fs_info->caching_block_groups, list)
10892                                 if (ctl->block_group == block_group) {
10893                                         caching_ctl = ctl;
10894                                         refcount_inc(&caching_ctl->count);
10895                                         break;
10896                                 }
10897                 }
10898                 if (caching_ctl)
10899                         list_del_init(&caching_ctl->list);
10900                 up_write(&fs_info->commit_root_sem);
10901                 if (caching_ctl) {
10902                         /* Once for the caching bgs list and once for us. */
10903                         put_caching_control(caching_ctl);
10904                         put_caching_control(caching_ctl);
10905                 }
10906         }
10907
10908         spin_lock(&trans->transaction->dirty_bgs_lock);
10909         WARN_ON(!list_empty(&block_group->dirty_list));
10910         WARN_ON(!list_empty(&block_group->io_list));
10911         spin_unlock(&trans->transaction->dirty_bgs_lock);
10912
10913         btrfs_remove_free_space_cache(block_group);
10914
10915         spin_lock(&block_group->space_info->lock);
10916         list_del_init(&block_group->ro_list);
10917
10918         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10919                 WARN_ON(block_group->space_info->total_bytes
10920                         < block_group->key.offset);
10921                 WARN_ON(block_group->space_info->bytes_readonly
10922                         < block_group->key.offset);
10923                 WARN_ON(block_group->space_info->disk_total
10924                         < block_group->key.offset * factor);
10925         }
10926         block_group->space_info->total_bytes -= block_group->key.offset;
10927         block_group->space_info->bytes_readonly -= block_group->key.offset;
10928         block_group->space_info->disk_total -= block_group->key.offset * factor;
10929
10930         spin_unlock(&block_group->space_info->lock);
10931
10932         memcpy(&key, &block_group->key, sizeof(key));
10933
10934         mutex_lock(&fs_info->chunk_mutex);
10935         spin_lock(&block_group->lock);
10936         block_group->removed = 1;
10937         /*
10938          * At this point trimming can't start on this block group, because we
10939          * removed the block group from the tree fs_info->block_group_cache_tree
10940          * so no one can't find it anymore and even if someone already got this
10941          * block group before we removed it from the rbtree, they have already
10942          * incremented block_group->trimming - if they didn't, they won't find
10943          * any free space entries because we already removed them all when we
10944          * called btrfs_remove_free_space_cache().
10945          *
10946          * And we must not remove the extent map from the fs_info->mapping_tree
10947          * to prevent the same logical address range and physical device space
10948          * ranges from being reused for a new block group. This is because our
10949          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10950          * completely transactionless, so while it is trimming a range the
10951          * currently running transaction might finish and a new one start,
10952          * allowing for new block groups to be created that can reuse the same
10953          * physical device locations unless we take this special care.
10954          *
10955          * There may also be an implicit trim operation if the file system
10956          * is mounted with -odiscard. The same protections must remain
10957          * in place until the extents have been discarded completely when
10958          * the transaction commit has completed.
10959          */
10960         remove_em = (atomic_read(&block_group->trimming) == 0);
10961         spin_unlock(&block_group->lock);
10962
10963         if (remove_em) {
10964                 struct extent_map_tree *em_tree;
10965
10966                 em_tree = &fs_info->mapping_tree.map_tree;
10967                 write_lock(&em_tree->lock);
10968                 remove_extent_mapping(em_tree, em);
10969                 write_unlock(&em_tree->lock);
10970                 /* once for the tree */
10971                 free_extent_map(em);
10972         }
10973
10974         mutex_unlock(&fs_info->chunk_mutex);
10975
10976         ret = remove_block_group_free_space(trans, block_group);
10977         if (ret)
10978                 goto out;
10979
10980         btrfs_put_block_group(block_group);
10981         btrfs_put_block_group(block_group);
10982
10983         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10984         if (ret > 0)
10985                 ret = -EIO;
10986         if (ret < 0)
10987                 goto out;
10988
10989         ret = btrfs_del_item(trans, root, path);
10990 out:
10991         if (remove_rsv)
10992                 btrfs_delayed_refs_rsv_release(fs_info, 1);
10993         btrfs_free_path(path);
10994         return ret;
10995 }
10996
10997 struct btrfs_trans_handle *
10998 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10999                                      const u64 chunk_offset)
11000 {
11001         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
11002         struct extent_map *em;
11003         struct map_lookup *map;
11004         unsigned int num_items;
11005
11006         read_lock(&em_tree->lock);
11007         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
11008         read_unlock(&em_tree->lock);
11009         ASSERT(em && em->start == chunk_offset);
11010
11011         /*
11012          * We need to reserve 3 + N units from the metadata space info in order
11013          * to remove a block group (done at btrfs_remove_chunk() and at
11014          * btrfs_remove_block_group()), which are used for:
11015          *
11016          * 1 unit for adding the free space inode's orphan (located in the tree
11017          * of tree roots).
11018          * 1 unit for deleting the block group item (located in the extent
11019          * tree).
11020          * 1 unit for deleting the free space item (located in tree of tree
11021          * roots).
11022          * N units for deleting N device extent items corresponding to each
11023          * stripe (located in the device tree).
11024          *
11025          * In order to remove a block group we also need to reserve units in the
11026          * system space info in order to update the chunk tree (update one or
11027          * more device items and remove one chunk item), but this is done at
11028          * btrfs_remove_chunk() through a call to check_system_chunk().
11029          */
11030         map = em->map_lookup;
11031         num_items = 3 + map->num_stripes;
11032         free_extent_map(em);
11033
11034         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
11035                                                            num_items, 1);
11036 }
11037
11038 /*
11039  * Process the unused_bgs list and remove any that don't have any allocated
11040  * space inside of them.
11041  */
11042 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
11043 {
11044         struct btrfs_block_group_cache *block_group;
11045         struct btrfs_space_info *space_info;
11046         struct btrfs_trans_handle *trans;
11047         int ret = 0;
11048
11049         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
11050                 return;
11051
11052         spin_lock(&fs_info->unused_bgs_lock);
11053         while (!list_empty(&fs_info->unused_bgs)) {
11054                 u64 start, end;
11055                 int trimming;
11056
11057                 block_group = list_first_entry(&fs_info->unused_bgs,
11058                                                struct btrfs_block_group_cache,
11059                                                bg_list);
11060                 list_del_init(&block_group->bg_list);
11061
11062                 space_info = block_group->space_info;
11063
11064                 if (ret || btrfs_mixed_space_info(space_info)) {
11065                         btrfs_put_block_group(block_group);
11066                         continue;
11067                 }
11068                 spin_unlock(&fs_info->unused_bgs_lock);
11069
11070                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
11071
11072                 /* Don't want to race with allocators so take the groups_sem */
11073                 down_write(&space_info->groups_sem);
11074                 spin_lock(&block_group->lock);
11075                 if (block_group->reserved || block_group->pinned ||
11076                     btrfs_block_group_used(&block_group->item) ||
11077                     block_group->ro ||
11078                     list_is_singular(&block_group->list)) {
11079                         /*
11080                          * We want to bail if we made new allocations or have
11081                          * outstanding allocations in this block group.  We do
11082                          * the ro check in case balance is currently acting on
11083                          * this block group.
11084                          */
11085                         trace_btrfs_skip_unused_block_group(block_group);
11086                         spin_unlock(&block_group->lock);
11087                         up_write(&space_info->groups_sem);
11088                         goto next;
11089                 }
11090                 spin_unlock(&block_group->lock);
11091
11092                 /* We don't want to force the issue, only flip if it's ok. */
11093                 ret = inc_block_group_ro(block_group, 0);
11094                 up_write(&space_info->groups_sem);
11095                 if (ret < 0) {
11096                         ret = 0;
11097                         goto next;
11098                 }
11099
11100                 /*
11101                  * Want to do this before we do anything else so we can recover
11102                  * properly if we fail to join the transaction.
11103                  */
11104                 trans = btrfs_start_trans_remove_block_group(fs_info,
11105                                                      block_group->key.objectid);
11106                 if (IS_ERR(trans)) {
11107                         btrfs_dec_block_group_ro(block_group);
11108                         ret = PTR_ERR(trans);
11109                         goto next;
11110                 }
11111
11112                 /*
11113                  * We could have pending pinned extents for this block group,
11114                  * just delete them, we don't care about them anymore.
11115                  */
11116                 start = block_group->key.objectid;
11117                 end = start + block_group->key.offset - 1;
11118                 /*
11119                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
11120                  * btrfs_finish_extent_commit(). If we are at transaction N,
11121                  * another task might be running finish_extent_commit() for the
11122                  * previous transaction N - 1, and have seen a range belonging
11123                  * to the block group in freed_extents[] before we were able to
11124                  * clear the whole block group range from freed_extents[]. This
11125                  * means that task can lookup for the block group after we
11126                  * unpinned it from freed_extents[] and removed it, leading to
11127                  * a BUG_ON() at btrfs_unpin_extent_range().
11128                  */
11129                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
11130                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
11131                                   EXTENT_DIRTY);
11132                 if (ret) {
11133                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11134                         btrfs_dec_block_group_ro(block_group);
11135                         goto end_trans;
11136                 }
11137                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
11138                                   EXTENT_DIRTY);
11139                 if (ret) {
11140                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11141                         btrfs_dec_block_group_ro(block_group);
11142                         goto end_trans;
11143                 }
11144                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11145
11146                 /* Reset pinned so btrfs_put_block_group doesn't complain */
11147                 spin_lock(&space_info->lock);
11148                 spin_lock(&block_group->lock);
11149
11150                 update_bytes_pinned(space_info, -block_group->pinned);
11151                 space_info->bytes_readonly += block_group->pinned;
11152                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
11153                                    -block_group->pinned,
11154                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
11155                 block_group->pinned = 0;
11156
11157                 spin_unlock(&block_group->lock);
11158                 spin_unlock(&space_info->lock);
11159
11160                 /* DISCARD can flip during remount */
11161                 trimming = btrfs_test_opt(fs_info, DISCARD);
11162
11163                 /* Implicit trim during transaction commit. */
11164                 if (trimming)
11165                         btrfs_get_block_group_trimming(block_group);
11166
11167                 /*
11168                  * Btrfs_remove_chunk will abort the transaction if things go
11169                  * horribly wrong.
11170                  */
11171                 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
11172
11173                 if (ret) {
11174                         if (trimming)
11175                                 btrfs_put_block_group_trimming(block_group);
11176                         goto end_trans;
11177                 }
11178
11179                 /*
11180                  * If we're not mounted with -odiscard, we can just forget
11181                  * about this block group. Otherwise we'll need to wait
11182                  * until transaction commit to do the actual discard.
11183                  */
11184                 if (trimming) {
11185                         spin_lock(&fs_info->unused_bgs_lock);
11186                         /*
11187                          * A concurrent scrub might have added us to the list
11188                          * fs_info->unused_bgs, so use a list_move operation
11189                          * to add the block group to the deleted_bgs list.
11190                          */
11191                         list_move(&block_group->bg_list,
11192                                   &trans->transaction->deleted_bgs);
11193                         spin_unlock(&fs_info->unused_bgs_lock);
11194                         btrfs_get_block_group(block_group);
11195                 }
11196 end_trans:
11197                 btrfs_end_transaction(trans);
11198 next:
11199                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
11200                 btrfs_put_block_group(block_group);
11201                 spin_lock(&fs_info->unused_bgs_lock);
11202         }
11203         spin_unlock(&fs_info->unused_bgs_lock);
11204 }
11205
11206 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11207 {
11208         struct btrfs_super_block *disk_super;
11209         u64 features;
11210         u64 flags;
11211         int mixed = 0;
11212         int ret;
11213
11214         disk_super = fs_info->super_copy;
11215         if (!btrfs_super_root(disk_super))
11216                 return -EINVAL;
11217
11218         features = btrfs_super_incompat_flags(disk_super);
11219         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11220                 mixed = 1;
11221
11222         flags = BTRFS_BLOCK_GROUP_SYSTEM;
11223         ret = create_space_info(fs_info, flags);
11224         if (ret)
11225                 goto out;
11226
11227         if (mixed) {
11228                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11229                 ret = create_space_info(fs_info, flags);
11230         } else {
11231                 flags = BTRFS_BLOCK_GROUP_METADATA;
11232                 ret = create_space_info(fs_info, flags);
11233                 if (ret)
11234                         goto out;
11235
11236                 flags = BTRFS_BLOCK_GROUP_DATA;
11237                 ret = create_space_info(fs_info, flags);
11238         }
11239 out:
11240         return ret;
11241 }
11242
11243 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
11244                                    u64 start, u64 end)
11245 {
11246         return unpin_extent_range(fs_info, start, end, false);
11247 }
11248
11249 /*
11250  * It used to be that old block groups would be left around forever.
11251  * Iterating over them would be enough to trim unused space.  Since we
11252  * now automatically remove them, we also need to iterate over unallocated
11253  * space.
11254  *
11255  * We don't want a transaction for this since the discard may take a
11256  * substantial amount of time.  We don't require that a transaction be
11257  * running, but we do need to take a running transaction into account
11258  * to ensure that we're not discarding chunks that were released or
11259  * allocated in the current transaction.
11260  *
11261  * Holding the chunks lock will prevent other threads from allocating
11262  * or releasing chunks, but it won't prevent a running transaction
11263  * from committing and releasing the memory that the pending chunks
11264  * list head uses.  For that, we need to take a reference to the
11265  * transaction and hold the commit root sem.  We only need to hold
11266  * it while performing the free space search since we have already
11267  * held back allocations.
11268  */
11269 static int btrfs_trim_free_extents(struct btrfs_device *device,
11270                                    struct fstrim_range *range, u64 *trimmed)
11271 {
11272         u64 start, len = 0, end = 0;
11273         int ret;
11274
11275         start = max_t(u64, range->start, SZ_1M);
11276         *trimmed = 0;
11277
11278         /* Discard not supported = nothing to do. */
11279         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
11280                 return 0;
11281
11282         /* Not writable = nothing to do. */
11283         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
11284                 return 0;
11285
11286         /* No free space = nothing to do. */
11287         if (device->total_bytes <= device->bytes_used)
11288                 return 0;
11289
11290         ret = 0;
11291
11292         while (1) {
11293                 struct btrfs_fs_info *fs_info = device->fs_info;
11294                 u64 bytes;
11295
11296                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11297                 if (ret)
11298                         break;
11299
11300                 find_first_clear_extent_bit(&device->alloc_state, start,
11301                                             &start, &end,
11302                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
11303                 /*
11304                  * If find_first_clear_extent_bit find a range that spans the
11305                  * end of the device it will set end to -1, in this case it's up
11306                  * to the caller to trim the value to the size of the device.
11307                  */
11308                 end = min(end, device->total_bytes - 1);
11309                 len = end - start + 1;
11310
11311                 /* We didn't find any extents */
11312                 if (!len) {
11313                         mutex_unlock(&fs_info->chunk_mutex);
11314                         ret = 0;
11315                         break;
11316                 }
11317
11318                 /* Keep going until we satisfy minlen or reach end of space */
11319                 if (len < range->minlen) {
11320                         mutex_unlock(&fs_info->chunk_mutex);
11321                         start += len;
11322                         continue;
11323                 }
11324
11325                 /* If we are out of the passed range break */
11326                 if (start > range->start + range->len - 1) {
11327                         mutex_unlock(&fs_info->chunk_mutex);
11328                         break;
11329                 }
11330
11331                 start = max(range->start, start);
11332                 len = min(range->len, len);
11333
11334                 ret = btrfs_issue_discard(device->bdev, start, len,
11335                                           &bytes);
11336                 if (!ret)
11337                         set_extent_bits(&device->alloc_state, start,
11338                                         start + bytes - 1,
11339                                         CHUNK_TRIMMED);
11340                 mutex_unlock(&fs_info->chunk_mutex);
11341
11342                 if (ret)
11343                         break;
11344
11345                 start += len;
11346                 *trimmed += bytes;
11347
11348                 /* We've trimmed enough */
11349                 if (*trimmed >= range->len)
11350                         break;
11351
11352                 if (fatal_signal_pending(current)) {
11353                         ret = -ERESTARTSYS;
11354                         break;
11355                 }
11356
11357                 cond_resched();
11358         }
11359
11360         return ret;
11361 }
11362
11363 /*
11364  * Trim the whole filesystem by:
11365  * 1) trimming the free space in each block group
11366  * 2) trimming the unallocated space on each device
11367  *
11368  * This will also continue trimming even if a block group or device encounters
11369  * an error.  The return value will be the last error, or 0 if nothing bad
11370  * happens.
11371  */
11372 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11373 {
11374         struct btrfs_block_group_cache *cache = NULL;
11375         struct btrfs_device *device;
11376         struct list_head *devices;
11377         u64 group_trimmed;
11378         u64 start;
11379         u64 end;
11380         u64 trimmed = 0;
11381         u64 bg_failed = 0;
11382         u64 dev_failed = 0;
11383         int bg_ret = 0;
11384         int dev_ret = 0;
11385         int ret = 0;
11386
11387         cache = btrfs_lookup_first_block_group(fs_info, range->start);
11388         for (; cache; cache = next_block_group(fs_info, cache)) {
11389                 if (cache->key.objectid >= (range->start + range->len)) {
11390                         btrfs_put_block_group(cache);
11391                         break;
11392                 }
11393
11394                 start = max(range->start, cache->key.objectid);
11395                 end = min(range->start + range->len,
11396                                 cache->key.objectid + cache->key.offset);
11397
11398                 if (end - start >= range->minlen) {
11399                         if (!block_group_cache_done(cache)) {
11400                                 ret = cache_block_group(cache, 0);
11401                                 if (ret) {
11402                                         bg_failed++;
11403                                         bg_ret = ret;
11404                                         continue;
11405                                 }
11406                                 ret = wait_block_group_cache_done(cache);
11407                                 if (ret) {
11408                                         bg_failed++;
11409                                         bg_ret = ret;
11410                                         continue;
11411                                 }
11412                         }
11413                         ret = btrfs_trim_block_group(cache,
11414                                                      &group_trimmed,
11415                                                      start,
11416                                                      end,
11417                                                      range->minlen);
11418
11419                         trimmed += group_trimmed;
11420                         if (ret) {
11421                                 bg_failed++;
11422                                 bg_ret = ret;
11423                                 continue;
11424                         }
11425                 }
11426         }
11427
11428         if (bg_failed)
11429                 btrfs_warn(fs_info,
11430                         "failed to trim %llu block group(s), last error %d",
11431                         bg_failed, bg_ret);
11432         mutex_lock(&fs_info->fs_devices->device_list_mutex);
11433         devices = &fs_info->fs_devices->devices;
11434         list_for_each_entry(device, devices, dev_list) {
11435                 ret = btrfs_trim_free_extents(device, range, &group_trimmed);
11436                 if (ret) {
11437                         dev_failed++;
11438                         dev_ret = ret;
11439                         break;
11440                 }
11441
11442                 trimmed += group_trimmed;
11443         }
11444         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11445
11446         if (dev_failed)
11447                 btrfs_warn(fs_info,
11448                         "failed to trim %llu device(s), last error %d",
11449                         dev_failed, dev_ret);
11450         range->len = trimmed;
11451         if (bg_ret)
11452                 return bg_ret;
11453         return dev_ret;
11454 }
11455
11456 /*
11457  * btrfs_{start,end}_write_no_snapshotting() are similar to
11458  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11459  * data into the page cache through nocow before the subvolume is snapshoted,
11460  * but flush the data into disk after the snapshot creation, or to prevent
11461  * operations while snapshotting is ongoing and that cause the snapshot to be
11462  * inconsistent (writes followed by expanding truncates for example).
11463  */
11464 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11465 {
11466         percpu_counter_dec(&root->subv_writers->counter);
11467         cond_wake_up(&root->subv_writers->wait);
11468 }
11469
11470 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11471 {
11472         if (atomic_read(&root->will_be_snapshotted))
11473                 return 0;
11474
11475         percpu_counter_inc(&root->subv_writers->counter);
11476         /*
11477          * Make sure counter is updated before we check for snapshot creation.
11478          */
11479         smp_mb();
11480         if (atomic_read(&root->will_be_snapshotted)) {
11481                 btrfs_end_write_no_snapshotting(root);
11482                 return 0;
11483         }
11484         return 1;
11485 }
11486
11487 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11488 {
11489         while (true) {
11490                 int ret;
11491
11492                 ret = btrfs_start_write_no_snapshotting(root);
11493                 if (ret)
11494                         break;
11495                 wait_var_event(&root->will_be_snapshotted,
11496                                !atomic_read(&root->will_be_snapshotted));
11497         }
11498 }
11499
11500 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11501 {
11502         struct btrfs_fs_info *fs_info = bg->fs_info;
11503
11504         spin_lock(&fs_info->unused_bgs_lock);
11505         if (list_empty(&bg->bg_list)) {
11506                 btrfs_get_block_group(bg);
11507                 trace_btrfs_add_unused_block_group(bg);
11508                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11509         }
11510         spin_unlock(&fs_info->unused_bgs_lock);
11511 }