btrfs: extent-tree: Add lockdep assert when updating space info
[sfrench/cifs-2.6.git] fs/btrfs/extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31
32 #undef SCRAMBLE_DELAYED_REFS
33
34 /*
35  * control flags for do_chunk_alloc's force field
36  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
37  * if we really need one.
38  *
39  * CHUNK_ALLOC_LIMITED means to only try and allocate one
40  * if we have very few chunks already allocated.  This is
41  * used as part of the clustering code to help make sure
42  * we have a good pool of storage to cluster in, without
43  * filling the FS with empty chunks
44  *
45  * CHUNK_ALLOC_FORCE means it must try to allocate one
46  *
47  */
48 enum {
49         CHUNK_ALLOC_NO_FORCE = 0,
50         CHUNK_ALLOC_LIMITED = 1,
51         CHUNK_ALLOC_FORCE = 2,
52 };
53
54 /*
55  * Declare a helper function to detect underflow of various space info members
56  */
57 #define DECLARE_SPACE_INFO_UPDATE(name)                                 \
58 static inline void update_##name(struct btrfs_space_info *sinfo,        \
59                                  s64 bytes)                             \
60 {                                                                       \
61         lockdep_assert_held(&sinfo->lock);                              \
62         if (bytes < 0 && sinfo->name < -bytes) {                        \
63                 WARN_ON(1);                                             \
64                 sinfo->name = 0;                                        \
65                 return;                                                 \
66         }                                                               \
67         sinfo->name += bytes;                                           \
68 }
69
70 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
71 DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
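/*
 * A minimal usage sketch (not part of this file): callers of the helpers
 * generated above must hold sinfo->lock, otherwise the lockdep_assert_held()
 * added by this patch fires.  The function name below is made up purely for
 * illustration.
 */
#if 0
static void example_reserve_metadata(struct btrfs_space_info *sinfo,
				     u64 num_bytes)
{
	spin_lock(&sinfo->lock);
	/* generated by DECLARE_SPACE_INFO_UPDATE(bytes_may_use) above */
	update_bytes_may_use(sinfo, num_bytes);
	spin_unlock(&sinfo->lock);

	/* ... and the matching release, again under the lock ... */
	spin_lock(&sinfo->lock);
	update_bytes_may_use(sinfo, -(s64)num_bytes);
	spin_unlock(&sinfo->lock);
}
#endif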
72
73 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
74                                struct btrfs_delayed_ref_node *node, u64 parent,
75                                u64 root_objectid, u64 owner_objectid,
76                                u64 owner_offset, int refs_to_drop,
77                                struct btrfs_delayed_extent_op *extra_op);
78 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
79                                     struct extent_buffer *leaf,
80                                     struct btrfs_extent_item *ei);
81 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
82                                       u64 parent, u64 root_objectid,
83                                       u64 flags, u64 owner, u64 offset,
84                                       struct btrfs_key *ins, int ref_mod);
85 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
86                                      struct btrfs_delayed_ref_node *node,
87                                      struct btrfs_delayed_extent_op *extent_op);
88 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
89                           int force);
90 static int find_next_key(struct btrfs_path *path, int level,
91                          struct btrfs_key *key);
92 static void dump_space_info(struct btrfs_fs_info *fs_info,
93                             struct btrfs_space_info *info, u64 bytes,
94                             int dump_block_groups);
95 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
96                                u64 num_bytes);
97 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
98                                      struct btrfs_space_info *space_info,
99                                      u64 num_bytes);
100 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
101                                      struct btrfs_space_info *space_info,
102                                      u64 num_bytes);
103
104 static noinline int
105 block_group_cache_done(struct btrfs_block_group_cache *cache)
106 {
107         smp_mb();
108         return cache->cached == BTRFS_CACHE_FINISHED ||
109                 cache->cached == BTRFS_CACHE_ERROR;
110 }
111
112 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
113 {
114         return (cache->flags & bits) == bits;
115 }
116
117 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
118 {
119         atomic_inc(&cache->count);
120 }
121
122 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
123 {
124         if (atomic_dec_and_test(&cache->count)) {
125                 WARN_ON(cache->pinned > 0);
126                 WARN_ON(cache->reserved > 0);
127
128                 /*
129                  * If not empty, someone is still holding the mutex of a
130                  * full_stripe_lock, which can only be released by its holder.
131                  * That holder will then hit a use-after-free when it tries
132                  * to release the full stripe lock.
133                  *
134                  * There is no better way to resolve this, so just warn.
135                  */
136                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
137                 kfree(cache->free_space_ctl);
138                 kfree(cache);
139         }
140 }
141
142 /*
143  * this adds the block group to the fs_info rb tree for the block group
144  * cache
145  */
146 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
147                                 struct btrfs_block_group_cache *block_group)
148 {
149         struct rb_node **p;
150         struct rb_node *parent = NULL;
151         struct btrfs_block_group_cache *cache;
152
153         spin_lock(&info->block_group_cache_lock);
154         p = &info->block_group_cache_tree.rb_node;
155
156         while (*p) {
157                 parent = *p;
158                 cache = rb_entry(parent, struct btrfs_block_group_cache,
159                                  cache_node);
160                 if (block_group->key.objectid < cache->key.objectid) {
161                         p = &(*p)->rb_left;
162                 } else if (block_group->key.objectid > cache->key.objectid) {
163                         p = &(*p)->rb_right;
164                 } else {
165                         spin_unlock(&info->block_group_cache_lock);
166                         return -EEXIST;
167                 }
168         }
169
170         rb_link_node(&block_group->cache_node, parent, p);
171         rb_insert_color(&block_group->cache_node,
172                         &info->block_group_cache_tree);
173
174         if (info->first_logical_byte > block_group->key.objectid)
175                 info->first_logical_byte = block_group->key.objectid;
176
177         spin_unlock(&info->block_group_cache_lock);
178
179         return 0;
180 }
181
182 /*
183  * This will return the block group at or after bytenr if contains is 0, else
184  * it will return the block group that contains the bytenr
185  */
186 static struct btrfs_block_group_cache *
187 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
188                               int contains)
189 {
190         struct btrfs_block_group_cache *cache, *ret = NULL;
191         struct rb_node *n;
192         u64 end, start;
193
194         spin_lock(&info->block_group_cache_lock);
195         n = info->block_group_cache_tree.rb_node;
196
197         while (n) {
198                 cache = rb_entry(n, struct btrfs_block_group_cache,
199                                  cache_node);
200                 end = cache->key.objectid + cache->key.offset - 1;
201                 start = cache->key.objectid;
202
203                 if (bytenr < start) {
204                         if (!contains && (!ret || start < ret->key.objectid))
205                                 ret = cache;
206                         n = n->rb_left;
207                 } else if (bytenr > start) {
208                         if (contains && bytenr <= end) {
209                                 ret = cache;
210                                 break;
211                         }
212                         n = n->rb_right;
213                 } else {
214                         ret = cache;
215                         break;
216                 }
217         }
218         if (ret) {
219                 btrfs_get_block_group(ret);
220                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
221                         info->first_logical_byte = ret->key.objectid;
222         }
223         spin_unlock(&info->block_group_cache_lock);
224
225         return ret;
226 }
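/*
 * A worked example of the search above, with a hypothetical layout: given two
 * block groups starting at 1G and 2G, each 1G long, looking up bytenr 1.5G
 * with contains == 1 returns the 1G group (1.5G falls inside [1G, 2G)), while
 * the same lookup with contains == 0 returns the 2G group, i.e. the first
 * group starting at or after bytenr.
 */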
227
228 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
229                                u64 start, u64 num_bytes)
230 {
231         u64 end = start + num_bytes - 1;
232         set_extent_bits(&fs_info->freed_extents[0],
233                         start, end, EXTENT_UPTODATE);
234         set_extent_bits(&fs_info->freed_extents[1],
235                         start, end, EXTENT_UPTODATE);
236         return 0;
237 }
238
239 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
240 {
241         struct btrfs_fs_info *fs_info = cache->fs_info;
242         u64 start, end;
243
244         start = cache->key.objectid;
245         end = start + cache->key.offset - 1;
246
247         clear_extent_bits(&fs_info->freed_extents[0],
248                           start, end, EXTENT_UPTODATE);
249         clear_extent_bits(&fs_info->freed_extents[1],
250                           start, end, EXTENT_UPTODATE);
251 }
252
253 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
254 {
255         struct btrfs_fs_info *fs_info = cache->fs_info;
256         u64 bytenr;
257         u64 *logical;
258         int stripe_len;
259         int i, nr, ret;
260
261         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
262                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
263                 cache->bytes_super += stripe_len;
264                 ret = add_excluded_extent(fs_info, cache->key.objectid,
265                                           stripe_len);
266                 if (ret)
267                         return ret;
268         }
269
270         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
271                 bytenr = btrfs_sb_offset(i);
272                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
273                                        bytenr, &logical, &nr, &stripe_len);
274                 if (ret)
275                         return ret;
276
277                 while (nr--) {
278                         u64 start, len;
279
280                         if (logical[nr] > cache->key.objectid +
281                             cache->key.offset)
282                                 continue;
283
284                         if (logical[nr] + stripe_len <= cache->key.objectid)
285                                 continue;
286
287                         start = logical[nr];
288                         if (start < cache->key.objectid) {
289                                 start = cache->key.objectid;
290                                 len = (logical[nr] + stripe_len) - start;
291                         } else {
292                                 len = min_t(u64, stripe_len,
293                                             cache->key.objectid +
294                                             cache->key.offset - start);
295                         }
296
297                         cache->bytes_super += len;
298                         ret = add_excluded_extent(fs_info, start, len);
299                         if (ret) {
300                                 kfree(logical);
301                                 return ret;
302                         }
303                 }
304
305                 kfree(logical);
306         }
307         return 0;
308 }
309
310 static struct btrfs_caching_control *
311 get_caching_control(struct btrfs_block_group_cache *cache)
312 {
313         struct btrfs_caching_control *ctl;
314
315         spin_lock(&cache->lock);
316         if (!cache->caching_ctl) {
317                 spin_unlock(&cache->lock);
318                 return NULL;
319         }
320
321         ctl = cache->caching_ctl;
322         refcount_inc(&ctl->count);
323         spin_unlock(&cache->lock);
324         return ctl;
325 }
326
327 static void put_caching_control(struct btrfs_caching_control *ctl)
328 {
329         if (refcount_dec_and_test(&ctl->count))
330                 kfree(ctl);
331 }
332
333 #ifdef CONFIG_BTRFS_DEBUG
334 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
335 {
336         struct btrfs_fs_info *fs_info = block_group->fs_info;
337         u64 start = block_group->key.objectid;
338         u64 len = block_group->key.offset;
339         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
340                 fs_info->nodesize : fs_info->sectorsize;
341         u64 step = chunk << 1;
342
343         while (len > chunk) {
344                 btrfs_remove_free_space(block_group, start, chunk);
345                 start += step;
346                 if (len < step)
347                         len = 0;
348                 else
349                         len -= step;
350         }
351 }
352 #endif
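/*
 * A worked example of the debug fragmenting above, assuming a 16K nodesize:
 * for a metadata block group, chunk is 16K and step is 32K, so the loop
 * removes the first 16K of the group, keeps the next 16K, removes the 16K
 * after that, and so on, leaving roughly every other chunk free and halving
 * the usable free space.
 */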
353
354 /*
355  * This is used when caching a block group.  Since we could have freed
356  * extents, we need to check the pinned_extents tree for any extents that
357  * can't be used yet, as their free space is only released when the transaction commits.
358  */
359 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
360                        u64 start, u64 end)
361 {
362         struct btrfs_fs_info *info = block_group->fs_info;
363         u64 extent_start, extent_end, size, total_added = 0;
364         int ret;
365
366         while (start < end) {
367                 ret = find_first_extent_bit(info->pinned_extents, start,
368                                             &extent_start, &extent_end,
369                                             EXTENT_DIRTY | EXTENT_UPTODATE,
370                                             NULL);
371                 if (ret)
372                         break;
373
374                 if (extent_start <= start) {
375                         start = extent_end + 1;
376                 } else if (extent_start > start && extent_start < end) {
377                         size = extent_start - start;
378                         total_added += size;
379                         ret = btrfs_add_free_space(block_group, start,
380                                                    size);
381                         BUG_ON(ret); /* -ENOMEM or logic error */
382                         start = extent_end + 1;
383                 } else {
384                         break;
385                 }
386         }
387
388         if (start < end) {
389                 size = end - start;
390                 total_added += size;
391                 ret = btrfs_add_free_space(block_group, start, size);
392                 BUG_ON(ret); /* -ENOMEM or logic error */
393         }
394
395         return total_added;
396 }
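/*
 * A worked example with hypothetical ranges: caching the region [0, 1M) of a
 * block group that has a single pinned extent at [256K, 320K) adds [0, 256K)
 * as free space inside the loop, restarts at 320K, finds no further pinned
 * extents and then adds the tail [320K, 1M) after the loop.  Only the pinned
 * 64K stays excluded until the running transaction commits.
 */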
397
398 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
399 {
400         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
401         struct btrfs_fs_info *fs_info = block_group->fs_info;
402         struct btrfs_root *extent_root = fs_info->extent_root;
403         struct btrfs_path *path;
404         struct extent_buffer *leaf;
405         struct btrfs_key key;
406         u64 total_found = 0;
407         u64 last = 0;
408         u32 nritems;
409         int ret;
410         bool wakeup = true;
411
412         path = btrfs_alloc_path();
413         if (!path)
414                 return -ENOMEM;
415
416         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
417
418 #ifdef CONFIG_BTRFS_DEBUG
419         /*
420          * If we're fragmenting we don't want to make anybody think we can
421          * allocate from this block group until we've had a chance to fragment
422          * the free space.
423          */
424         if (btrfs_should_fragment_free_space(block_group))
425                 wakeup = false;
426 #endif
427         /*
428          * We don't want to deadlock with somebody trying to allocate a new
429          * extent for the extent root while also trying to search the extent
430          * root to add free space.  So we skip locking and search the commit
431          * root, since it's read-only.
432          */
433         path->skip_locking = 1;
434         path->search_commit_root = 1;
435         path->reada = READA_FORWARD;
436
437         key.objectid = last;
438         key.offset = 0;
439         key.type = BTRFS_EXTENT_ITEM_KEY;
440
441 next:
442         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
443         if (ret < 0)
444                 goto out;
445
446         leaf = path->nodes[0];
447         nritems = btrfs_header_nritems(leaf);
448
449         while (1) {
450                 if (btrfs_fs_closing(fs_info) > 1) {
451                         last = (u64)-1;
452                         break;
453                 }
454
455                 if (path->slots[0] < nritems) {
456                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
457                 } else {
458                         ret = find_next_key(path, 0, &key);
459                         if (ret)
460                                 break;
461
462                         if (need_resched() ||
463                             rwsem_is_contended(&fs_info->commit_root_sem)) {
464                                 if (wakeup)
465                                         caching_ctl->progress = last;
466                                 btrfs_release_path(path);
467                                 up_read(&fs_info->commit_root_sem);
468                                 mutex_unlock(&caching_ctl->mutex);
469                                 cond_resched();
470                                 mutex_lock(&caching_ctl->mutex);
471                                 down_read(&fs_info->commit_root_sem);
472                                 goto next;
473                         }
474
475                         ret = btrfs_next_leaf(extent_root, path);
476                         if (ret < 0)
477                                 goto out;
478                         if (ret)
479                                 break;
480                         leaf = path->nodes[0];
481                         nritems = btrfs_header_nritems(leaf);
482                         continue;
483                 }
484
485                 if (key.objectid < last) {
486                         key.objectid = last;
487                         key.offset = 0;
488                         key.type = BTRFS_EXTENT_ITEM_KEY;
489
490                         if (wakeup)
491                                 caching_ctl->progress = last;
492                         btrfs_release_path(path);
493                         goto next;
494                 }
495
496                 if (key.objectid < block_group->key.objectid) {
497                         path->slots[0]++;
498                         continue;
499                 }
500
501                 if (key.objectid >= block_group->key.objectid +
502                     block_group->key.offset)
503                         break;
504
505                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
506                     key.type == BTRFS_METADATA_ITEM_KEY) {
507                         total_found += add_new_free_space(block_group, last,
508                                                           key.objectid);
509                         if (key.type == BTRFS_METADATA_ITEM_KEY)
510                                 last = key.objectid +
511                                         fs_info->nodesize;
512                         else
513                                 last = key.objectid + key.offset;
514
515                         if (total_found > CACHING_CTL_WAKE_UP) {
516                                 total_found = 0;
517                                 if (wakeup)
518                                         wake_up(&caching_ctl->wait);
519                         }
520                 }
521                 path->slots[0]++;
522         }
523         ret = 0;
524
525         total_found += add_new_free_space(block_group, last,
526                                           block_group->key.objectid +
527                                           block_group->key.offset);
528         caching_ctl->progress = (u64)-1;
529
530 out:
531         btrfs_free_path(path);
532         return ret;
533 }
534
535 static noinline void caching_thread(struct btrfs_work *work)
536 {
537         struct btrfs_block_group_cache *block_group;
538         struct btrfs_fs_info *fs_info;
539         struct btrfs_caching_control *caching_ctl;
540         int ret;
541
542         caching_ctl = container_of(work, struct btrfs_caching_control, work);
543         block_group = caching_ctl->block_group;
544         fs_info = block_group->fs_info;
545
546         mutex_lock(&caching_ctl->mutex);
547         down_read(&fs_info->commit_root_sem);
548
549         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
550                 ret = load_free_space_tree(caching_ctl);
551         else
552                 ret = load_extent_tree_free(caching_ctl);
553
554         spin_lock(&block_group->lock);
555         block_group->caching_ctl = NULL;
556         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
557         spin_unlock(&block_group->lock);
558
559 #ifdef CONFIG_BTRFS_DEBUG
560         if (btrfs_should_fragment_free_space(block_group)) {
561                 u64 bytes_used;
562
563                 spin_lock(&block_group->space_info->lock);
564                 spin_lock(&block_group->lock);
565                 bytes_used = block_group->key.offset -
566                         btrfs_block_group_used(&block_group->item);
567                 block_group->space_info->bytes_used += bytes_used >> 1;
568                 spin_unlock(&block_group->lock);
569                 spin_unlock(&block_group->space_info->lock);
570                 fragment_free_space(block_group);
571         }
572 #endif
573
574         caching_ctl->progress = (u64)-1;
575
576         up_read(&fs_info->commit_root_sem);
577         free_excluded_extents(block_group);
578         mutex_unlock(&caching_ctl->mutex);
579
580         wake_up(&caching_ctl->wait);
581
582         put_caching_control(caching_ctl);
583         btrfs_put_block_group(block_group);
584 }
585
586 static int cache_block_group(struct btrfs_block_group_cache *cache,
587                              int load_cache_only)
588 {
589         DEFINE_WAIT(wait);
590         struct btrfs_fs_info *fs_info = cache->fs_info;
591         struct btrfs_caching_control *caching_ctl;
592         int ret = 0;
593
594         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
595         if (!caching_ctl)
596                 return -ENOMEM;
597
598         INIT_LIST_HEAD(&caching_ctl->list);
599         mutex_init(&caching_ctl->mutex);
600         init_waitqueue_head(&caching_ctl->wait);
601         caching_ctl->block_group = cache;
602         caching_ctl->progress = cache->key.objectid;
603         refcount_set(&caching_ctl->count, 1);
604         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
605                         caching_thread, NULL, NULL);
606
607         spin_lock(&cache->lock);
608         /*
609          * This should be a rare occasion, but I think this can happen in the
610          * case where one thread starts to load the space cache info, and then
611          * some other thread starts a transaction commit which tries to do an
612          * allocation while the other thread is still loading the space cache
613          * info.  The previous loop should have kept us from choosing this block
614          * group, but if we've moved to the state where we will wait on caching
615          * block groups we need to first check if we're doing a fast load here,
616          * so we can wait for it to finish, otherwise we could end up allocating
617          * from a block group whose cache gets evicted for one reason or
618          * another.
619          */
620         while (cache->cached == BTRFS_CACHE_FAST) {
621                 struct btrfs_caching_control *ctl;
622
623                 ctl = cache->caching_ctl;
624                 refcount_inc(&ctl->count);
625                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
626                 spin_unlock(&cache->lock);
627
628                 schedule();
629
630                 finish_wait(&ctl->wait, &wait);
631                 put_caching_control(ctl);
632                 spin_lock(&cache->lock);
633         }
634
635         if (cache->cached != BTRFS_CACHE_NO) {
636                 spin_unlock(&cache->lock);
637                 kfree(caching_ctl);
638                 return 0;
639         }
640         WARN_ON(cache->caching_ctl);
641         cache->caching_ctl = caching_ctl;
642         cache->cached = BTRFS_CACHE_FAST;
643         spin_unlock(&cache->lock);
644
645         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
646                 mutex_lock(&caching_ctl->mutex);
647                 ret = load_free_space_cache(cache);
648
649                 spin_lock(&cache->lock);
650                 if (ret == 1) {
651                         cache->caching_ctl = NULL;
652                         cache->cached = BTRFS_CACHE_FINISHED;
653                         cache->last_byte_to_unpin = (u64)-1;
654                         caching_ctl->progress = (u64)-1;
655                 } else {
656                         if (load_cache_only) {
657                                 cache->caching_ctl = NULL;
658                                 cache->cached = BTRFS_CACHE_NO;
659                         } else {
660                                 cache->cached = BTRFS_CACHE_STARTED;
661                                 cache->has_caching_ctl = 1;
662                         }
663                 }
664                 spin_unlock(&cache->lock);
665 #ifdef CONFIG_BTRFS_DEBUG
666                 if (ret == 1 &&
667                     btrfs_should_fragment_free_space(cache)) {
668                         u64 bytes_used;
669
670                         spin_lock(&cache->space_info->lock);
671                         spin_lock(&cache->lock);
672                         bytes_used = cache->key.offset -
673                                 btrfs_block_group_used(&cache->item);
674                         cache->space_info->bytes_used += bytes_used >> 1;
675                         spin_unlock(&cache->lock);
676                         spin_unlock(&cache->space_info->lock);
677                         fragment_free_space(cache);
678                 }
679 #endif
680                 mutex_unlock(&caching_ctl->mutex);
681
682                 wake_up(&caching_ctl->wait);
683                 if (ret == 1) {
684                         put_caching_control(caching_ctl);
685                         free_excluded_extents(cache);
686                         return 0;
687                 }
688         } else {
689                 /*
690                  * We're either using the free space tree or no caching at all.
691          * Set cached to the appropriate value and wake up any waiters.
692                  */
693                 spin_lock(&cache->lock);
694                 if (load_cache_only) {
695                         cache->caching_ctl = NULL;
696                         cache->cached = BTRFS_CACHE_NO;
697                 } else {
698                         cache->cached = BTRFS_CACHE_STARTED;
699                         cache->has_caching_ctl = 1;
700                 }
701                 spin_unlock(&cache->lock);
702                 wake_up(&caching_ctl->wait);
703         }
704
705         if (load_cache_only) {
706                 put_caching_control(caching_ctl);
707                 return 0;
708         }
709
710         down_write(&fs_info->commit_root_sem);
711         refcount_inc(&caching_ctl->count);
712         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
713         up_write(&fs_info->commit_root_sem);
714
715         btrfs_get_block_group(cache);
716
717         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
718
719         return ret;
720 }
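/*
 * A minimal sketch (not part of this file) of how a caller typically waits
 * for the async caching started above to finish.  The function name is
 * hypothetical, but it only uses helpers defined earlier in this file.
 */
#if 0
static void example_wait_block_group_cached(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl = get_caching_control(cache);

	if (!ctl)	/* caching already finished (or never started) */
		return;
	wait_event(ctl->wait, block_group_cache_done(cache));
	put_caching_control(ctl);
}
#endif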
721
722 /*
723  * return the block group that starts at or after bytenr
724  */
725 static struct btrfs_block_group_cache *
726 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
727 {
728         return block_group_cache_tree_search(info, bytenr, 0);
729 }
730
731 /*
732  * return the block group that contains the given bytenr
733  */
734 struct btrfs_block_group_cache *btrfs_lookup_block_group(
735                                                  struct btrfs_fs_info *info,
736                                                  u64 bytenr)
737 {
738         return block_group_cache_tree_search(info, bytenr, 1);
739 }
740
741 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
742                                                   u64 flags)
743 {
744         struct list_head *head = &info->space_info;
745         struct btrfs_space_info *found;
746
747         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
748
749         rcu_read_lock();
750         list_for_each_entry_rcu(found, head, list) {
751                 if (found->flags & flags) {
752                         rcu_read_unlock();
753                         return found;
754                 }
755         }
756         rcu_read_unlock();
757         return NULL;
758 }
759
760 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
761 {
762         if (ref->type == BTRFS_REF_METADATA) {
763                 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
764                         return BTRFS_BLOCK_GROUP_SYSTEM;
765                 else
766                         return BTRFS_BLOCK_GROUP_METADATA;
767         }
768         return BTRFS_BLOCK_GROUP_DATA;
769 }
770
771 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
772                              struct btrfs_ref *ref)
773 {
774         struct btrfs_space_info *space_info;
775         u64 flags = generic_ref_to_space_flags(ref);
776
777         space_info = __find_space_info(fs_info, flags);
778         ASSERT(space_info);
779         percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
780                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
781 }
782
783 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
784                              struct btrfs_ref *ref)
785 {
786         struct btrfs_space_info *space_info;
787         u64 flags = generic_ref_to_space_flags(ref);
788
789         space_info = __find_space_info(fs_info, flags);
790         ASSERT(space_info);
791         percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
792                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
793 }
794
795 /*
796  * after adding space to the filesystem, we need to clear the full flags
797  * on all the space infos.
798  */
799 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
800 {
801         struct list_head *head = &info->space_info;
802         struct btrfs_space_info *found;
803
804         rcu_read_lock();
805         list_for_each_entry_rcu(found, head, list)
806                 found->full = 0;
807         rcu_read_unlock();
808 }
809
810 /* simple helper to search for an existing data extent at a given offset */
811 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
812 {
813         int ret;
814         struct btrfs_key key;
815         struct btrfs_path *path;
816
817         path = btrfs_alloc_path();
818         if (!path)
819                 return -ENOMEM;
820
821         key.objectid = start;
822         key.offset = len;
823         key.type = BTRFS_EXTENT_ITEM_KEY;
824         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
825         btrfs_free_path(path);
826         return ret;
827 }
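/*
 * A minimal sketch (not part of this file) of how the helper above is used:
 * btrfs_search_slot() returns 0 only on an exact key match, so a return of 0
 * here means an EXTENT_ITEM for exactly (start, len) exists.  The wrapper
 * name is made up for illustration.
 */
#if 0
static bool example_data_extent_exists(struct btrfs_fs_info *fs_info,
				       u64 start, u64 len)
{
	return btrfs_lookup_data_extent(fs_info, start, len) == 0;
}
#endif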
828
829 /*
830  * helper function to lookup reference count and flags of a tree block.
831  *
832  * The head node for a delayed ref is used to store the sum of all the
833  * reference count modifications queued up in the rbtree.  The head
834  * node may also store the extent flags to set.  This way you can check
835  * to see what the reference count and extent flags would be once all of
836  * the delayed refs have been processed.
837  */
838 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
839                              struct btrfs_fs_info *fs_info, u64 bytenr,
840                              u64 offset, int metadata, u64 *refs, u64 *flags)
841 {
842         struct btrfs_delayed_ref_head *head;
843         struct btrfs_delayed_ref_root *delayed_refs;
844         struct btrfs_path *path;
845         struct btrfs_extent_item *ei;
846         struct extent_buffer *leaf;
847         struct btrfs_key key;
848         u32 item_size;
849         u64 num_refs;
850         u64 extent_flags;
851         int ret;
852
853         /*
854          * If we don't have skinny metadata, don't bother doing anything
855          * different
856          */
857         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
858                 offset = fs_info->nodesize;
859                 metadata = 0;
860         }
861
862         path = btrfs_alloc_path();
863         if (!path)
864                 return -ENOMEM;
865
866         if (!trans) {
867                 path->skip_locking = 1;
868                 path->search_commit_root = 1;
869         }
870
871 search_again:
872         key.objectid = bytenr;
873         key.offset = offset;
874         if (metadata)
875                 key.type = BTRFS_METADATA_ITEM_KEY;
876         else
877                 key.type = BTRFS_EXTENT_ITEM_KEY;
878
879         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
880         if (ret < 0)
881                 goto out_free;
882
883         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
884                 if (path->slots[0]) {
885                         path->slots[0]--;
886                         btrfs_item_key_to_cpu(path->nodes[0], &key,
887                                               path->slots[0]);
888                         if (key.objectid == bytenr &&
889                             key.type == BTRFS_EXTENT_ITEM_KEY &&
890                             key.offset == fs_info->nodesize)
891                                 ret = 0;
892                 }
893         }
894
895         if (ret == 0) {
896                 leaf = path->nodes[0];
897                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
898                 if (item_size >= sizeof(*ei)) {
899                         ei = btrfs_item_ptr(leaf, path->slots[0],
900                                             struct btrfs_extent_item);
901                         num_refs = btrfs_extent_refs(leaf, ei);
902                         extent_flags = btrfs_extent_flags(leaf, ei);
903                 } else {
904                         ret = -EINVAL;
905                         btrfs_print_v0_err(fs_info);
906                         if (trans)
907                                 btrfs_abort_transaction(trans, ret);
908                         else
909                                 btrfs_handle_fs_error(fs_info, ret, NULL);
910
911                         goto out_free;
912                 }
913
914                 BUG_ON(num_refs == 0);
915         } else {
916                 num_refs = 0;
917                 extent_flags = 0;
918                 ret = 0;
919         }
920
921         if (!trans)
922                 goto out;
923
924         delayed_refs = &trans->transaction->delayed_refs;
925         spin_lock(&delayed_refs->lock);
926         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
927         if (head) {
928                 if (!mutex_trylock(&head->mutex)) {
929                         refcount_inc(&head->refs);
930                         spin_unlock(&delayed_refs->lock);
931
932                         btrfs_release_path(path);
933
934                         /*
935                          * Mutex was contended, block until it's released and try
936                          * again
937                          */
938                         mutex_lock(&head->mutex);
939                         mutex_unlock(&head->mutex);
940                         btrfs_put_delayed_ref_head(head);
941                         goto search_again;
942                 }
943                 spin_lock(&head->lock);
944                 if (head->extent_op && head->extent_op->update_flags)
945                         extent_flags |= head->extent_op->flags_to_set;
946                 else
947                         BUG_ON(num_refs == 0);
948
949                 num_refs += head->ref_mod;
950                 spin_unlock(&head->lock);
951                 mutex_unlock(&head->mutex);
952         }
953         spin_unlock(&delayed_refs->lock);
954 out:
955         WARN_ON(num_refs == 0);
956         if (refs)
957                 *refs = num_refs;
958         if (flags)
959                 *flags = extent_flags;
960 out_free:
961         btrfs_free_path(path);
962         return ret;
963 }
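/*
 * A minimal sketch (not part of this file) showing one way the lookup above
 * can be used: deciding whether a tree block is shared by checking its
 * combined on-disk plus delayed reference count.  The wrapper is
 * hypothetical; with SKINNY_METADATA the offset passed for a metadata
 * extent is its level.
 */
#if 0
static bool example_tree_block_is_shared(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info,
					 struct extent_buffer *buf)
{
	u64 refs = 0;
	u64 flags = 0;

	if (btrfs_lookup_extent_info(trans, fs_info, buf->start,
				     btrfs_header_level(buf), 1,
				     &refs, &flags))
		return false;	/* lookup failed, let the caller decide */
	return refs > 1;
}
#endif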
964
965 /*
966  * Back reference rules.  Back refs have three main goals:
967  *
968  * 1) differentiate between all holders of references to an extent so that
969  *    when a reference is dropped we can make sure it was a valid reference
970  *    before freeing the extent.
971  *
972  * 2) Provide enough information to quickly find the holders of an extent
973  *    if we notice a given block is corrupted or bad.
974  *
975  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
976  *    maintenance.  This is actually the same as #2, but with a slightly
977  *    different use case.
978  *
979  * There are two kinds of back refs. The implicit back ref is optimized
980  * for pointers in non-shared tree blocks. For a given pointer in a block,
981  * back refs of this kind provide information about the block's owner tree
982  * and the pointer's key. This information allows us to find the block by
983  * b-tree searching. The full back ref is for pointers in tree blocks not
984  * referenced by their owner trees. The location of the tree block is recorded
985  * in the back ref. Actually the full back ref is generic and can be
986  * used in all cases where the implicit back ref is used. The major shortcoming
987  * of the full back ref is its overhead: every time a tree block gets
988  * COWed, we have to update the back ref entries for all pointers in it.
989  *
990  * For a newly allocated tree block, we use implicit back refs for
991  * pointers in it. This means most tree related operations only involve
992  * implicit back refs. For a tree block created in an old transaction, the
993  * only way to drop a reference to it is to COW it. So we can detect the
994  * event that a tree block loses its owner tree's reference and do the
995  * back refs conversion.
996  *
997  * When a tree block is COWed through a tree, there are four cases:
998  *
999  * The reference count of the block is one and the tree is the block's
1000  * owner tree. Nothing to do in this case.
1001  *
1002  * The reference count of the block is one and the tree is not the
1003  * block's owner tree. In this case, full back refs are used for pointers
1004  * in the block. Remove these full back refs and add implicit back refs for
1005  * every pointer in the new block.
1006  *
1007  * The reference count of the block is greater than one and the tree is
1008  * the block's owner tree. In this case, implicit back refs are used for
1009  * pointers in the block. Add full back refs for every pointer in the
1010  * block and increase the lower level extents' reference counts. The original
1011  * implicit back refs are carried over to the new block.
1012  *
1013  * The reference count of the block is greater than one and the tree is
1014  * not the block's owner tree. Add implicit back refs for every pointer in
1015  * the new block and increase the lower level extents' reference counts.
1016  *
1017  * Back reference key composition:
1018  *
1019  * The key objectid corresponds to the first byte in the extent,
1020  * The key type is used to differentiate between types of back refs.
1021  * There are different meanings of the key offset for different types
1022  * of back refs.
1023  *
1024  * File extents can be referenced by:
1025  *
1026  * - multiple snapshots, subvolumes, or different generations in one subvol
1027  * - different files inside a single subvolume
1028  * - different offsets inside a file (bookend extents in file.c)
1029  *
1030  * The extent ref structure for the implicit back refs has fields for:
1031  *
1032  * - Objectid of the subvolume root
1033  * - objectid of the file holding the reference
1034  * - original offset in the file
1035  * - how many bookend extents
1036  *
1037  * The key offset for the implicit back refs is hash of the first
1038  * three fields.
1039  *
1040  * The extent ref structure for the full back refs has a field for:
1041  *
1042  * - number of pointers in the tree leaf
1043  *
1044  * The key offset for the full back refs is the first byte of
1045  * the tree leaf.
1046  *
1047  * When a file extent is allocated, the implicit back ref is used and
1048  * the fields are filled in as:
1049  *
1050  *     (root_key.objectid, inode objectid, offset in file, 1)
1051  *
1052  * When a file extent is removed by file truncation, we find the
1053  * corresponding implicit back refs and check the following fields:
1054  *
1055  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1056  *
1057  * Btree extents can be referenced by:
1058  *
1059  * - Different subvolumes
1060  *
1061  * Both the implicit back refs and the full back refs for tree blocks
1062  * only consist of a key. The key offset for the implicit back refs is
1063  * the objectid of the block's owner tree. The key offset for the full back
1064  * refs is the first byte of the parent block.
1065  *
1066  * When implicit back refs are used, information about the lowest key and
1067  * level of the tree block is required. This information is stored in
1068  * the tree block info structure.
1069  */
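/*
 * A worked example of the key layout described above (all numbers are
 * hypothetical): a data extent at bytenr X referenced from inode 257 at file
 * offset 0 in subvolume 5 carries an implicit back ref keyed as
 * (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0)).  The full variant is keyed
 * as (X, BTRFS_SHARED_DATA_REF_KEY, leaf_bytenr), where leaf_bytenr is the
 * first byte of the leaf holding the file extent item.  A tree block owned by
 * subvolume 5 carries (bytenr, BTRFS_TREE_BLOCK_REF_KEY, 5) implicitly, or
 * (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent_bytenr) as a full back ref.
 */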
1070
1071 /*
1072  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1073  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1074  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1075  */
1076 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1077                                      struct btrfs_extent_inline_ref *iref,
1078                                      enum btrfs_inline_ref_type is_data)
1079 {
1080         int type = btrfs_extent_inline_ref_type(eb, iref);
1081         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1082
1083         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1084             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1085             type == BTRFS_SHARED_DATA_REF_KEY ||
1086             type == BTRFS_EXTENT_DATA_REF_KEY) {
1087                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1088                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1089                                 return type;
1090                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1091                                 ASSERT(eb->fs_info);
1092                                 /*
1093                                  * Every shared one has parent tree
1094                                  * block, which must be aligned to
1095                                  * nodesize.
1096                                  */
1097                                 if (offset &&
1098                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1099                                         return type;
1100                         }
1101                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1102                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1103                                 return type;
1104                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1105                                 ASSERT(eb->fs_info);
1106                                 /*
1107                                  * Every shared one has parent tree
1108                                  * block, which must be aligned to
1109                                  * nodesize.
1110                                  */
1111                                 if (offset &&
1112                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1113                                         return type;
1114                         }
1115                 } else {
1116                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1117                         return type;
1118                 }
1119         }
1120
1121         btrfs_print_leaf((struct extent_buffer *)eb);
1122         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1123                   eb->start, type);
1124         WARN_ON(1);
1125
1126         return BTRFS_REF_TYPE_INVALID;
1127 }
1128
1129 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1130 {
1131         u32 high_crc = ~(u32)0;
1132         u32 low_crc = ~(u32)0;
1133         __le64 lenum;
1134
1135         lenum = cpu_to_le64(root_objectid);
1136         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1137         lenum = cpu_to_le64(owner);
1138         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1139         lenum = cpu_to_le64(offset);
1140         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1141
1142         return ((u64)high_crc << 31) ^ (u64)low_crc;
1143 }
1144
1145 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1146                                      struct btrfs_extent_data_ref *ref)
1147 {
1148         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1149                                     btrfs_extent_data_ref_objectid(leaf, ref),
1150                                     btrfs_extent_data_ref_offset(leaf, ref));
1151 }
1152
1153 static int match_extent_data_ref(struct extent_buffer *leaf,
1154                                  struct btrfs_extent_data_ref *ref,
1155                                  u64 root_objectid, u64 owner, u64 offset)
1156 {
1157         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1158             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1159             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1160                 return 0;
1161         return 1;
1162 }
1163
1164 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1165                                            struct btrfs_path *path,
1166                                            u64 bytenr, u64 parent,
1167                                            u64 root_objectid,
1168                                            u64 owner, u64 offset)
1169 {
1170         struct btrfs_root *root = trans->fs_info->extent_root;
1171         struct btrfs_key key;
1172         struct btrfs_extent_data_ref *ref;
1173         struct extent_buffer *leaf;
1174         u32 nritems;
1175         int ret;
1176         int recow;
1177         int err = -ENOENT;
1178
1179         key.objectid = bytenr;
1180         if (parent) {
1181                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1182                 key.offset = parent;
1183         } else {
1184                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1185                 key.offset = hash_extent_data_ref(root_objectid,
1186                                                   owner, offset);
1187         }
1188 again:
1189         recow = 0;
1190         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1191         if (ret < 0) {
1192                 err = ret;
1193                 goto fail;
1194         }
1195
1196         if (parent) {
1197                 if (!ret)
1198                         return 0;
1199                 goto fail;
1200         }
1201
1202         leaf = path->nodes[0];
1203         nritems = btrfs_header_nritems(leaf);
1204         while (1) {
1205                 if (path->slots[0] >= nritems) {
1206                         ret = btrfs_next_leaf(root, path);
1207                         if (ret < 0)
1208                                 err = ret;
1209                         if (ret)
1210                                 goto fail;
1211
1212                         leaf = path->nodes[0];
1213                         nritems = btrfs_header_nritems(leaf);
1214                         recow = 1;
1215                 }
1216
1217                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1218                 if (key.objectid != bytenr ||
1219                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1220                         goto fail;
1221
1222                 ref = btrfs_item_ptr(leaf, path->slots[0],
1223                                      struct btrfs_extent_data_ref);
1224
1225                 if (match_extent_data_ref(leaf, ref, root_objectid,
1226                                           owner, offset)) {
1227                         if (recow) {
1228                                 btrfs_release_path(path);
1229                                 goto again;
1230                         }
1231                         err = 0;
1232                         break;
1233                 }
1234                 path->slots[0]++;
1235         }
1236 fail:
1237         return err;
1238 }
1239
1240 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1241                                            struct btrfs_path *path,
1242                                            u64 bytenr, u64 parent,
1243                                            u64 root_objectid, u64 owner,
1244                                            u64 offset, int refs_to_add)
1245 {
1246         struct btrfs_root *root = trans->fs_info->extent_root;
1247         struct btrfs_key key;
1248         struct extent_buffer *leaf;
1249         u32 size;
1250         u32 num_refs;
1251         int ret;
1252
1253         key.objectid = bytenr;
1254         if (parent) {
1255                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1256                 key.offset = parent;
1257                 size = sizeof(struct btrfs_shared_data_ref);
1258         } else {
1259                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1260                 key.offset = hash_extent_data_ref(root_objectid,
1261                                                   owner, offset);
1262                 size = sizeof(struct btrfs_extent_data_ref);
1263         }
1264
1265         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1266         if (ret && ret != -EEXIST)
1267                 goto fail;
1268
1269         leaf = path->nodes[0];
1270         if (parent) {
1271                 struct btrfs_shared_data_ref *ref;
1272                 ref = btrfs_item_ptr(leaf, path->slots[0],
1273                                      struct btrfs_shared_data_ref);
1274                 if (ret == 0) {
1275                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1276                 } else {
1277                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1278                         num_refs += refs_to_add;
1279                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1280                 }
1281         } else {
1282                 struct btrfs_extent_data_ref *ref;
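                /*
                 * -EEXIST means some other (root, objectid, offset) tuple
                 * hashed to the same key offset.  Walk forward by bumping
                 * key.offset until we either find our own ref (and only bump
                 * its count below) or an unused offset where a fresh item can
                 * be inserted.
                 */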
1283                 while (ret == -EEXIST) {
1284                         ref = btrfs_item_ptr(leaf, path->slots[0],
1285                                              struct btrfs_extent_data_ref);
1286                         if (match_extent_data_ref(leaf, ref, root_objectid,
1287                                                   owner, offset))
1288                                 break;
1289                         btrfs_release_path(path);
1290                         key.offset++;
1291                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1292                                                       size);
1293                         if (ret && ret != -EEXIST)
1294                                 goto fail;
1295
1296                         leaf = path->nodes[0];
1297                 }
1298                 ref = btrfs_item_ptr(leaf, path->slots[0],
1299                                      struct btrfs_extent_data_ref);
1300                 if (ret == 0) {
1301                         btrfs_set_extent_data_ref_root(leaf, ref,
1302                                                        root_objectid);
1303                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1304                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1305                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1306                 } else {
1307                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1308                         num_refs += refs_to_add;
1309                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1310                 }
1311         }
1312         btrfs_mark_buffer_dirty(leaf);
1313         ret = 0;
1314 fail:
1315         btrfs_release_path(path);
1316         return ret;
1317 }
1318
1319 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1320                                            struct btrfs_path *path,
1321                                            int refs_to_drop, int *last_ref)
1322 {
1323         struct btrfs_key key;
1324         struct btrfs_extent_data_ref *ref1 = NULL;
1325         struct btrfs_shared_data_ref *ref2 = NULL;
1326         struct extent_buffer *leaf;
1327         u32 num_refs = 0;
1328         int ret = 0;
1329
1330         leaf = path->nodes[0];
1331         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1332
1333         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1334                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1335                                       struct btrfs_extent_data_ref);
1336                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1337         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1338                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1339                                       struct btrfs_shared_data_ref);
1340                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1341         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1342                 btrfs_print_v0_err(trans->fs_info);
1343                 btrfs_abort_transaction(trans, -EINVAL);
1344                 return -EINVAL;
1345         } else {
1346                 BUG();
1347         }
1348
1349         BUG_ON(num_refs < refs_to_drop);
1350         num_refs -= refs_to_drop;
1351
1352         if (num_refs == 0) {
1353                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1354                 *last_ref = 1;
1355         } else {
1356                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1357                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1358                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1359                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1360                 btrfs_mark_buffer_dirty(leaf);
1361         }
1362         return ret;
1363 }
1364
1365 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1366                                           struct btrfs_extent_inline_ref *iref)
1367 {
1368         struct btrfs_key key;
1369         struct extent_buffer *leaf;
1370         struct btrfs_extent_data_ref *ref1;
1371         struct btrfs_shared_data_ref *ref2;
1372         u32 num_refs = 0;
1373         int type;
1374
1375         leaf = path->nodes[0];
1376         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1377
1378         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1379         if (iref) {
1380                 /*
1381                  * If type is invalid, we should have bailed out earlier than
1382                  * this call.
1383                  */
1384                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1385                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1386                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1387                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1388                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1389                 } else {
1390                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1391                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1392                 }
1393         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1394                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1395                                       struct btrfs_extent_data_ref);
1396                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1397         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1398                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1399                                       struct btrfs_shared_data_ref);
1400                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1401         } else {
1402                 WARN_ON(1);
1403         }
1404         return num_refs;
1405 }
1406
1407 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1408                                           struct btrfs_path *path,
1409                                           u64 bytenr, u64 parent,
1410                                           u64 root_objectid)
1411 {
1412         struct btrfs_root *root = trans->fs_info->extent_root;
1413         struct btrfs_key key;
1414         int ret;
1415
1416         key.objectid = bytenr;
1417         if (parent) {
1418                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1419                 key.offset = parent;
1420         } else {
1421                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1422                 key.offset = root_objectid;
1423         }
1424
1425         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1426         if (ret > 0)
1427                 ret = -ENOENT;
1428         return ret;
1429 }
1430
1431 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1432                                           struct btrfs_path *path,
1433                                           u64 bytenr, u64 parent,
1434                                           u64 root_objectid)
1435 {
1436         struct btrfs_key key;
1437         int ret;
1438
1439         key.objectid = bytenr;
1440         if (parent) {
1441                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1442                 key.offset = parent;
1443         } else {
1444                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1445                 key.offset = root_objectid;
1446         }
1447
1448         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1449                                       path, &key, 0);
1450         btrfs_release_path(path);
1451         return ret;
1452 }
1453
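     /*
      * Metadata owners are tree levels, which are always below
      * BTRFS_FIRST_FREE_OBJECTID; data extents are owned by inode numbers at
      * or above it.  A non-zero parent selects the shared ref type.
      */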
1454 static inline int extent_ref_type(u64 parent, u64 owner)
1455 {
1456         int type;
1457         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1458                 if (parent > 0)
1459                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1460                 else
1461                         type = BTRFS_TREE_BLOCK_REF_KEY;
1462         } else {
1463                 if (parent > 0)
1464                         type = BTRFS_SHARED_DATA_REF_KEY;
1465                 else
1466                         type = BTRFS_EXTENT_DATA_REF_KEY;
1467         }
1468         return type;
1469 }
1470
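     /*
      * Find the key that immediately follows the current path position by
      * walking up the path levels; returns 0 and fills @key, or 1 if there is
      * no next key.
      */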
1471 static int find_next_key(struct btrfs_path *path, int level,
1472                          struct btrfs_key *key)
1473
1474 {
1475         for (; level < BTRFS_MAX_LEVEL; level++) {
1476                 if (!path->nodes[level])
1477                         break;
1478                 if (path->slots[level] + 1 >=
1479                     btrfs_header_nritems(path->nodes[level]))
1480                         continue;
1481                 if (level == 0)
1482                         btrfs_item_key_to_cpu(path->nodes[level], key,
1483                                               path->slots[level] + 1);
1484                 else
1485                         btrfs_node_key_to_cpu(path->nodes[level], key,
1486                                               path->slots[level] + 1);
1487                 return 0;
1488         }
1489         return 1;
1490 }
1491
1492 /*
1493  * Look for an inline back ref. If the back ref is found, *ref_ret is set
1494  * to the address of the inline back ref, and 0 is returned.
1495  *
1496  * If the back ref isn't found, *ref_ret is set to the address where it
1497  * should be inserted, and -ENOENT is returned.
1498  *
1499  * If insert is true and there are too many inline back refs, the path
1500  * points to the extent item, and -EAGAIN is returned.
1501  *
1502  * NOTE: inline back refs are ordered in the same way that back ref
1503  *       items in the tree are ordered.
1504  */
1505 static noinline_for_stack
1506 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1507                                  struct btrfs_path *path,
1508                                  struct btrfs_extent_inline_ref **ref_ret,
1509                                  u64 bytenr, u64 num_bytes,
1510                                  u64 parent, u64 root_objectid,
1511                                  u64 owner, u64 offset, int insert)
1512 {
1513         struct btrfs_fs_info *fs_info = trans->fs_info;
1514         struct btrfs_root *root = fs_info->extent_root;
1515         struct btrfs_key key;
1516         struct extent_buffer *leaf;
1517         struct btrfs_extent_item *ei;
1518         struct btrfs_extent_inline_ref *iref;
1519         u64 flags;
1520         u64 item_size;
1521         unsigned long ptr;
1522         unsigned long end;
1523         int extra_size;
1524         int type;
1525         int want;
1526         int ret;
1527         int err = 0;
1528         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1529         int needed;
1530
1531         key.objectid = bytenr;
1532         key.type = BTRFS_EXTENT_ITEM_KEY;
1533         key.offset = num_bytes;
1534
1535         want = extent_ref_type(parent, owner);
1536         if (insert) {
1537                 extra_size = btrfs_extent_inline_ref_size(want);
1538                 path->keep_locks = 1;
1539         } else
1540                 extra_size = -1;
1541
1542         /*
1543          * Owner is our level, so we can just add one to get the level for the
1544          * block we are interested in.
1545          */
1546         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1547                 key.type = BTRFS_METADATA_ITEM_KEY;
1548                 key.offset = owner;
1549         }
1550
1551 again:
1552         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1553         if (ret < 0) {
1554                 err = ret;
1555                 goto out;
1556         }
1557
1558         /*
1559          * We may be a newly converted file system which still has the old fat
1560          * extent entries for metadata, so try and see if we have one of those.
1561          */
1562         if (ret > 0 && skinny_metadata) {
1563                 skinny_metadata = false;
1564                 if (path->slots[0]) {
1565                         path->slots[0]--;
1566                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1567                                               path->slots[0]);
1568                         if (key.objectid == bytenr &&
1569                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1570                             key.offset == num_bytes)
1571                                 ret = 0;
1572                 }
1573                 if (ret) {
1574                         key.objectid = bytenr;
1575                         key.type = BTRFS_EXTENT_ITEM_KEY;
1576                         key.offset = num_bytes;
1577                         btrfs_release_path(path);
1578                         goto again;
1579                 }
1580         }
1581
1582         if (ret && !insert) {
1583                 err = -ENOENT;
1584                 goto out;
1585         } else if (WARN_ON(ret)) {
1586                 err = -EIO;
1587                 goto out;
1588         }
1589
1590         leaf = path->nodes[0];
1591         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1592         if (unlikely(item_size < sizeof(*ei))) {
1593                 err = -EINVAL;
1594                 btrfs_print_v0_err(fs_info);
1595                 btrfs_abort_transaction(trans, err);
1596                 goto out;
1597         }
1598
1599         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1600         flags = btrfs_extent_flags(leaf, ei);
1601
1602         ptr = (unsigned long)(ei + 1);
1603         end = (unsigned long)ei + item_size;
1604
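             /*
              * Old style (non-skinny) metadata extent items carry a
              * btrfs_tree_block_info before the inline refs, skip past it.
              */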
1605         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1606                 ptr += sizeof(struct btrfs_tree_block_info);
1607                 BUG_ON(ptr > end);
1608         }
1609
1610         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1611                 needed = BTRFS_REF_TYPE_DATA;
1612         else
1613                 needed = BTRFS_REF_TYPE_BLOCK;
1614
1615         err = -ENOENT;
1616         while (1) {
1617                 if (ptr >= end) {
1618                         WARN_ON(ptr > end);
1619                         break;
1620                 }
1621                 iref = (struct btrfs_extent_inline_ref *)ptr;
1622                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1623                 if (type == BTRFS_REF_TYPE_INVALID) {
1624                         err = -EUCLEAN;
1625                         goto out;
1626                 }
1627
1628                 if (want < type)
1629                         break;
1630                 if (want > type) {
1631                         ptr += btrfs_extent_inline_ref_size(type);
1632                         continue;
1633                 }
1634
1635                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1636                         struct btrfs_extent_data_ref *dref;
1637                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1638                         if (match_extent_data_ref(leaf, dref, root_objectid,
1639                                                   owner, offset)) {
1640                                 err = 0;
1641                                 break;
1642                         }
1643                         if (hash_extent_data_ref_item(leaf, dref) <
1644                             hash_extent_data_ref(root_objectid, owner, offset))
1645                                 break;
1646                 } else {
1647                         u64 ref_offset;
1648                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1649                         if (parent > 0) {
1650                                 if (parent == ref_offset) {
1651                                         err = 0;
1652                                         break;
1653                                 }
1654                                 if (ref_offset < parent)
1655                                         break;
1656                         } else {
1657                                 if (root_objectid == ref_offset) {
1658                                         err = 0;
1659                                         break;
1660                                 }
1661                                 if (ref_offset < root_objectid)
1662                                         break;
1663                         }
1664                 }
1665                 ptr += btrfs_extent_inline_ref_size(type);
1666         }
1667         if (err == -ENOENT && insert) {
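                     /*
                      * The extent item cannot hold another inline ref, return
                      * -EAGAIN so the caller falls back to a keyed backref
                      * item instead.
                      */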
1668                 if (item_size + extra_size >=
1669                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1670                         err = -EAGAIN;
1671                         goto out;
1672                 }
1673                 /*
1674                  * To add a new inline back ref, we have to make sure
1675                  * there is no corresponding back ref item.
1676                  * For simplicity, we just do not add a new inline back
1677                  * ref if there is any other kind of item for this block.
1678                  */
1679                 if (find_next_key(path, 0, &key) == 0 &&
1680                     key.objectid == bytenr &&
1681                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1682                         err = -EAGAIN;
1683                         goto out;
1684                 }
1685         }
1686         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1687 out:
1688         if (insert) {
1689                 path->keep_locks = 0;
1690                 btrfs_unlock_up_safe(path, 1);
1691         }
1692         return err;
1693 }
1694
1695 /*
1696  * helper to add a new inline back ref
1697  */
1698 static noinline_for_stack
1699 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1700                                  struct btrfs_path *path,
1701                                  struct btrfs_extent_inline_ref *iref,
1702                                  u64 parent, u64 root_objectid,
1703                                  u64 owner, u64 offset, int refs_to_add,
1704                                  struct btrfs_delayed_extent_op *extent_op)
1705 {
1706         struct extent_buffer *leaf;
1707         struct btrfs_extent_item *ei;
1708         unsigned long ptr;
1709         unsigned long end;
1710         unsigned long item_offset;
1711         u64 refs;
1712         int size;
1713         int type;
1714
1715         leaf = path->nodes[0];
1716         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1717         item_offset = (unsigned long)iref - (unsigned long)ei;
1718
1719         type = extent_ref_type(parent, owner);
1720         size = btrfs_extent_inline_ref_size(type);
1721
1722         btrfs_extend_item(path, size);
1723
1724         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1725         refs = btrfs_extent_refs(leaf, ei);
1726         refs += refs_to_add;
1727         btrfs_set_extent_refs(leaf, ei, refs);
1728         if (extent_op)
1729                 __run_delayed_extent_op(extent_op, leaf, ei);
1730
1731         ptr = (unsigned long)ei + item_offset;
1732         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
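             /*
              * The item was already extended by 'size' above; shift the inline
              * refs that sort after the insertion point to open a gap for the
              * new ref.
              */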
1733         if (ptr < end - size)
1734                 memmove_extent_buffer(leaf, ptr + size, ptr,
1735                                       end - size - ptr);
1736
1737         iref = (struct btrfs_extent_inline_ref *)ptr;
1738         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1739         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1740                 struct btrfs_extent_data_ref *dref;
1741                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1742                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1743                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1744                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1745                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1746         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1747                 struct btrfs_shared_data_ref *sref;
1748                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1749                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1750                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1751         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1752                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1753         } else {
1754                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1755         }
1756         btrfs_mark_buffer_dirty(leaf);
1757 }
1758
1759 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1760                                  struct btrfs_path *path,
1761                                  struct btrfs_extent_inline_ref **ref_ret,
1762                                  u64 bytenr, u64 num_bytes, u64 parent,
1763                                  u64 root_objectid, u64 owner, u64 offset)
1764 {
1765         int ret;
1766
1767         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1768                                            num_bytes, parent, root_objectid,
1769                                            owner, offset, 0);
1770         if (ret != -ENOENT)
1771                 return ret;
1772
1773         btrfs_release_path(path);
1774         *ref_ret = NULL;
1775
1776         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1777                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1778                                             root_objectid);
1779         } else {
1780                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1781                                              root_objectid, owner, offset);
1782         }
1783         return ret;
1784 }
1785
1786 /*
1787  * helper to update/remove inline back ref
1788  */
1789 static noinline_for_stack
1790 void update_inline_extent_backref(struct btrfs_path *path,
1791                                   struct btrfs_extent_inline_ref *iref,
1792                                   int refs_to_mod,
1793                                   struct btrfs_delayed_extent_op *extent_op,
1794                                   int *last_ref)
1795 {
1796         struct extent_buffer *leaf = path->nodes[0];
1797         struct btrfs_extent_item *ei;
1798         struct btrfs_extent_data_ref *dref = NULL;
1799         struct btrfs_shared_data_ref *sref = NULL;
1800         unsigned long ptr;
1801         unsigned long end;
1802         u32 item_size;
1803         int size;
1804         int type;
1805         u64 refs;
1806
1807         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1808         refs = btrfs_extent_refs(leaf, ei);
1809         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1810         refs += refs_to_mod;
1811         btrfs_set_extent_refs(leaf, ei, refs);
1812         if (extent_op)
1813                 __run_delayed_extent_op(extent_op, leaf, ei);
1814
1815         /*
1816          * If type is invalid, we should have bailed out after
1817          * lookup_inline_extent_backref().
1818          */
1819         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1820         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1821
1822         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1823                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1824                 refs = btrfs_extent_data_ref_count(leaf, dref);
1825         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1826                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1827                 refs = btrfs_shared_data_ref_count(leaf, sref);
1828         } else {
1829                 refs = 1;
1830                 BUG_ON(refs_to_mod != -1);
1831         }
1832
1833         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1834         refs += refs_to_mod;
1835
1836         if (refs > 0) {
1837                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1838                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1839                 else
1840                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1841         } else {
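                     /*
                      * The inline ref's count dropped to zero, remove it by
                      * shifting the tail of the item down and truncating the
                      * item.
                      */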
1842                 *last_ref = 1;
1843                 size = btrfs_extent_inline_ref_size(type);
1844                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1845                 ptr = (unsigned long)iref;
1846                 end = (unsigned long)ei + item_size;
1847                 if (ptr + size < end)
1848                         memmove_extent_buffer(leaf, ptr, ptr + size,
1849                                               end - ptr - size);
1850                 item_size -= size;
1851                 btrfs_truncate_item(path, item_size, 1);
1852         }
1853         btrfs_mark_buffer_dirty(leaf);
1854 }
1855
1856 static noinline_for_stack
1857 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1858                                  struct btrfs_path *path,
1859                                  u64 bytenr, u64 num_bytes, u64 parent,
1860                                  u64 root_objectid, u64 owner,
1861                                  u64 offset, int refs_to_add,
1862                                  struct btrfs_delayed_extent_op *extent_op)
1863 {
1864         struct btrfs_extent_inline_ref *iref;
1865         int ret;
1866
1867         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1868                                            num_bytes, parent, root_objectid,
1869                                            owner, offset, 1);
1870         if (ret == 0) {
1871                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1872                 update_inline_extent_backref(path, iref, refs_to_add,
1873                                              extent_op, NULL);
1874         } else if (ret == -ENOENT) {
1875                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1876                                             root_objectid, owner, offset,
1877                                             refs_to_add, extent_op);
1878                 ret = 0;
1879         }
1880         return ret;
1881 }
1882
1883 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1884                                  struct btrfs_path *path,
1885                                  u64 bytenr, u64 parent, u64 root_objectid,
1886                                  u64 owner, u64 offset, int refs_to_add)
1887 {
1888         int ret;
1889         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1890                 BUG_ON(refs_to_add != 1);
1891                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1892                                             root_objectid);
1893         } else {
1894                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1895                                              root_objectid, owner, offset,
1896                                              refs_to_add);
1897         }
1898         return ret;
1899 }
1900
1901 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1902                                  struct btrfs_path *path,
1903                                  struct btrfs_extent_inline_ref *iref,
1904                                  int refs_to_drop, int is_data, int *last_ref)
1905 {
1906         int ret = 0;
1907
1908         BUG_ON(!is_data && refs_to_drop != 1);
1909         if (iref) {
1910                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1911                                              last_ref);
1912         } else if (is_data) {
1913                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1914                                              last_ref);
1915         } else {
1916                 *last_ref = 1;
1917                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1918         }
1919         return ret;
1920 }
1921
1922 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1923                                u64 *discarded_bytes)
1924 {
1925         int j, ret = 0;
1926         u64 bytes_left, end;
1927         u64 aligned_start = ALIGN(start, 1 << 9);
1928
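             /*
              * Discards work on 512 byte sectors, trim an unaligned start up
              * to the next sector and round the length down to whole sectors.
              */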
1929         if (WARN_ON(start != aligned_start)) {
1930                 len -= aligned_start - start;
1931                 len = round_down(len, 1 << 9);
1932                 start = aligned_start;
1933         }
1934
1935         *discarded_bytes = 0;
1936
1937         if (!len)
1938                 return 0;
1939
1940         end = start + len;
1941         bytes_left = len;
1942
1943         /* Skip any superblocks on this device. */
1944         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1945                 u64 sb_start = btrfs_sb_offset(j);
1946                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1947                 u64 size = sb_start - start;
1948
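                     /*
                      * Ignore this superblock copy unless it overlaps the
                      * remaining discard range.
                      */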
1949                 if (!in_range(sb_start, start, bytes_left) &&
1950                     !in_range(sb_end, start, bytes_left) &&
1951                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1952                         continue;
1953
1954                 /*
1955                  * Superblock spans beginning of range.  Adjust start and
1956                  * try again.
1957                  */
1958                 if (sb_start <= start) {
1959                         start += sb_end - start;
1960                         if (start > end) {
1961                                 bytes_left = 0;
1962                                 break;
1963                         }
1964                         bytes_left = end - start;
1965                         continue;
1966                 }
1967
1968                 if (size) {
1969                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1970                                                    GFP_NOFS, 0);
1971                         if (!ret)
1972                                 *discarded_bytes += size;
1973                         else if (ret != -EOPNOTSUPP)
1974                                 return ret;
1975                 }
1976
1977                 start = sb_end;
1978                 if (start > end) {
1979                         bytes_left = 0;
1980                         break;
1981                 }
1982                 bytes_left = end - start;
1983         }
1984
1985         if (bytes_left) {
1986                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1987                                            GFP_NOFS, 0);
1988                 if (!ret)
1989                         *discarded_bytes += bytes_left;
1990         }
1991         return ret;
1992 }
1993
1994 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1995                          u64 num_bytes, u64 *actual_bytes)
1996 {
1997         int ret;
1998         u64 discarded_bytes = 0;
1999         struct btrfs_bio *bbio = NULL;
2000
2001
2002         /*
2003          * Avoid races with device replace and make sure our bbio has devices
2004          * associated to its stripes that don't go away while we are discarding.
2005          */
2006         btrfs_bio_counter_inc_blocked(fs_info);
2007         /* Tell the block device(s) that the sectors can be discarded */
2008         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2009                               &bbio, 0);
2010         /* Error condition is -ENOMEM */
2011         if (!ret) {
2012                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2013                 int i;
2014
2015
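                     /*
                      * Issue a discard on every stripe of the mapped range,
                      * skipping missing devices and devices without discard
                      * support.
                      */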
2016                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2017                         u64 bytes;
2018                         struct request_queue *req_q;
2019
2020                         if (!stripe->dev->bdev) {
2021                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2022                                 continue;
2023                         }
2024                         req_q = bdev_get_queue(stripe->dev->bdev);
2025                         if (!blk_queue_discard(req_q))
2026                                 continue;
2027
2028                         ret = btrfs_issue_discard(stripe->dev->bdev,
2029                                                   stripe->physical,
2030                                                   stripe->length,
2031                                                   &bytes);
2032                         if (!ret)
2033                                 discarded_bytes += bytes;
2034                         else if (ret != -EOPNOTSUPP)
2035                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2036
2037                         /*
2038                          * Just in case we get back EOPNOTSUPP for some reason,
2039                          * ignore the return value so we don't break callers of
2040                          * discard_extent.
2041                          */
2042                         ret = 0;
2043                 }
2044                 btrfs_put_bbio(bbio);
2045         }
2046         btrfs_bio_counter_dec(fs_info);
2047
2048         if (actual_bytes)
2049                 *actual_bytes = discarded_bytes;
2050
2051
2052         if (ret == -EOPNOTSUPP)
2053                 ret = 0;
2054         return ret;
2055 }
2056
2057 /* Can return -ENOMEM */
2058 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2059                          struct btrfs_ref *generic_ref)
2060 {
2061         struct btrfs_fs_info *fs_info = trans->fs_info;
2062         int old_ref_mod, new_ref_mod;
2063         int ret;
2064
2065         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
2066                generic_ref->action);
2067         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
2068                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
2069
2070         if (generic_ref->type == BTRFS_REF_METADATA)
2071                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
2072                                 NULL, &old_ref_mod, &new_ref_mod);
2073         else
2074                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
2075                                                  &old_ref_mod, &new_ref_mod);
2076
2077         btrfs_ref_tree_mod(fs_info, generic_ref);
2078
2079         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2080                 sub_pinned_bytes(fs_info, generic_ref);
2081
2082         return ret;
2083 }
2084
2085 /*
2086  * __btrfs_inc_extent_ref - insert backreference for a given extent
2087  *
2088  * @trans:          Handle of transaction
2089  *
2090  * @node:           The delayed ref node used to get the bytenr/length for
2091  *                  extent whose references are incremented.
2092  *
2093  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2094  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2095  *                  bytenr of the parent block. Since new extents are always
2096  *                  created with indirect references, this will only be the case
2097  *                  when relocating a shared extent. In that case, root_objectid
2098  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
2099  *                  be 0.
2100  *
2101  * @root_objectid:  The id of the root where this modification has originated,
2102  *                  this can be either one of the well-known metadata trees or
2103  *                  the subvolume id which references this extent.
2104  *
2105  * @owner:          For data extents it is the inode number of the owning file.
2106  *                  For metadata extents this parameter holds the level in the
2107  *                  tree of the extent.
2108  *
2109  * @offset:         For metadata extents the offset is ignored and is currently
2110  *                  always passed as 0. For data extents it is the file offset
2111  *                  this extent belongs to.
2112  *
2113  * @refs_to_add:    Number of references to add
2114  *
2115  * @extent_op:      Pointer to a structure, holding information necessary when
2116  *                  updating a tree block's flags
2117  *
2118  */
2119 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2120                                   struct btrfs_delayed_ref_node *node,
2121                                   u64 parent, u64 root_objectid,
2122                                   u64 owner, u64 offset, int refs_to_add,
2123                                   struct btrfs_delayed_extent_op *extent_op)
2124 {
2125         struct btrfs_path *path;
2126         struct extent_buffer *leaf;
2127         struct btrfs_extent_item *item;
2128         struct btrfs_key key;
2129         u64 bytenr = node->bytenr;
2130         u64 num_bytes = node->num_bytes;
2131         u64 refs;
2132         int ret;
2133
2134         path = btrfs_alloc_path();
2135         if (!path)
2136                 return -ENOMEM;
2137
2138         path->reada = READA_FORWARD;
2139         path->leave_spinning = 1;
2140         /* this will set up the path even if it fails to insert the back ref */
2141         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2142                                            parent, root_objectid, owner,
2143                                            offset, refs_to_add, extent_op);
2144         if ((ret < 0 && ret != -EAGAIN) || !ret)
2145                 goto out;
2146
2147         /*
2148          * Ok we had -EAGAIN which means we didn't have space to insert an
2149          * inline extent ref, so just update the reference count and add a
2150          * normal backref.
2151          */
2152         leaf = path->nodes[0];
2153         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2154         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2155         refs = btrfs_extent_refs(leaf, item);
2156         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2157         if (extent_op)
2158                 __run_delayed_extent_op(extent_op, leaf, item);
2159
2160         btrfs_mark_buffer_dirty(leaf);
2161         btrfs_release_path(path);
2162
2163         path->reada = READA_FORWARD;
2164         path->leave_spinning = 1;
2165         /* now insert the actual backref */
2166         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2167                                     owner, offset, refs_to_add);
2168         if (ret)
2169                 btrfs_abort_transaction(trans, ret);
2170 out:
2171         btrfs_free_path(path);
2172         return ret;
2173 }
2174
2175 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2176                                 struct btrfs_delayed_ref_node *node,
2177                                 struct btrfs_delayed_extent_op *extent_op,
2178                                 int insert_reserved)
2179 {
2180         int ret = 0;
2181         struct btrfs_delayed_data_ref *ref;
2182         struct btrfs_key ins;
2183         u64 parent = 0;
2184         u64 ref_root = 0;
2185         u64 flags = 0;
2186
2187         ins.objectid = node->bytenr;
2188         ins.offset = node->num_bytes;
2189         ins.type = BTRFS_EXTENT_ITEM_KEY;
2190
2191         ref = btrfs_delayed_node_to_data_ref(node);
2192         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2193
2194         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2195                 parent = ref->parent;
2196         ref_root = ref->root;
2197
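             /*
              * insert_reserved means the extent item for this newly allocated
              * extent has not been inserted yet, so create it here with the
              * initial ref; otherwise add or drop a ref on the existing item.
              */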
2198         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2199                 if (extent_op)
2200                         flags |= extent_op->flags_to_set;
2201                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2202                                                  flags, ref->objectid,
2203                                                  ref->offset, &ins,
2204                                                  node->ref_mod);
2205         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2206                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2207                                              ref->objectid, ref->offset,
2208                                              node->ref_mod, extent_op);
2209         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2210                 ret = __btrfs_free_extent(trans, node, parent,
2211                                           ref_root, ref->objectid,
2212                                           ref->offset, node->ref_mod,
2213                                           extent_op);
2214         } else {
2215                 BUG();
2216         }
2217         return ret;
2218 }
2219
2220 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2221                                     struct extent_buffer *leaf,
2222                                     struct btrfs_extent_item *ei)
2223 {
2224         u64 flags = btrfs_extent_flags(leaf, ei);
2225         if (extent_op->update_flags) {
2226                 flags |= extent_op->flags_to_set;
2227                 btrfs_set_extent_flags(leaf, ei, flags);
2228         }
2229
2230         if (extent_op->update_key) {
2231                 struct btrfs_tree_block_info *bi;
2232                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2233                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2234                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2235         }
2236 }
2237
2238 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2239                                  struct btrfs_delayed_ref_head *head,
2240                                  struct btrfs_delayed_extent_op *extent_op)
2241 {
2242         struct btrfs_fs_info *fs_info = trans->fs_info;
2243         struct btrfs_key key;
2244         struct btrfs_path *path;
2245         struct btrfs_extent_item *ei;
2246         struct extent_buffer *leaf;
2247         u32 item_size;
2248         int ret;
2249         int err = 0;
2250         int metadata = !extent_op->is_data;
2251
2252         if (trans->aborted)
2253                 return 0;
2254
2255         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2256                 metadata = 0;
2257
2258         path = btrfs_alloc_path();
2259         if (!path)
2260                 return -ENOMEM;
2261
2262         key.objectid = head->bytenr;
2263
2264         if (metadata) {
2265                 key.type = BTRFS_METADATA_ITEM_KEY;
2266                 key.offset = extent_op->level;
2267         } else {
2268                 key.type = BTRFS_EXTENT_ITEM_KEY;
2269                 key.offset = head->num_bytes;
2270         }
2271
2272 again:
2273         path->reada = READA_FORWARD;
2274         path->leave_spinning = 1;
2275         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2276         if (ret < 0) {
2277                 err = ret;
2278                 goto out;
2279         }
2280         if (ret > 0) {
2281                 if (metadata) {
2282                         if (path->slots[0] > 0) {
2283                                 path->slots[0]--;
2284                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2285                                                       path->slots[0]);
2286                                 if (key.objectid == head->bytenr &&
2287                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2288                                     key.offset == head->num_bytes)
2289                                         ret = 0;
2290                         }
2291                         if (ret > 0) {
2292                                 btrfs_release_path(path);
2293                                 metadata = 0;
2294
2295                                 key.objectid = head->bytenr;
2296                                 key.offset = head->num_bytes;
2297                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2298                                 goto again;
2299                         }
2300                 } else {
2301                         err = -EIO;
2302                         goto out;
2303                 }
2304         }
2305
2306         leaf = path->nodes[0];
2307         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2308
2309         if (unlikely(item_size < sizeof(*ei))) {
2310                 err = -EINVAL;
2311                 btrfs_print_v0_err(fs_info);
2312                 btrfs_abort_transaction(trans, err);
2313                 goto out;
2314         }
2315
2316         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2317         __run_delayed_extent_op(extent_op, leaf, ei);
2318
2319         btrfs_mark_buffer_dirty(leaf);
2320 out:
2321         btrfs_free_path(path);
2322         return err;
2323 }
2324
2325 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2326                                 struct btrfs_delayed_ref_node *node,
2327                                 struct btrfs_delayed_extent_op *extent_op,
2328                                 int insert_reserved)
2329 {
2330         int ret = 0;
2331         struct btrfs_delayed_tree_ref *ref;
2332         u64 parent = 0;
2333         u64 ref_root = 0;
2334
2335         ref = btrfs_delayed_node_to_tree_ref(node);
2336         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2337
2338         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2339                 parent = ref->parent;
2340         ref_root = ref->root;
2341
2342         if (node->ref_mod != 1) {
2343                 btrfs_err(trans->fs_info,
2344         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2345                           node->bytenr, node->ref_mod, node->action, ref_root,
2346                           parent);
2347                 return -EIO;
2348         }
2349         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2350                 BUG_ON(!extent_op || !extent_op->update_flags);
2351                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2352         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2353                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2354                                              ref->level, 0, 1, extent_op);
2355         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2356                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2357                                           ref->level, 0, 1, extent_op);
2358         } else {
2359                 BUG();
2360         }
2361         return ret;
2362 }
2363
2364 /* helper function to actually process a single delayed ref entry */
2365 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2366                                struct btrfs_delayed_ref_node *node,
2367                                struct btrfs_delayed_extent_op *extent_op,
2368                                int insert_reserved)
2369 {
2370         int ret = 0;
2371
2372         if (trans->aborted) {
2373                 if (insert_reserved)
2374                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2375                                          node->num_bytes, 1);
2376                 return 0;
2377         }
2378
2379         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2380             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2381                 ret = run_delayed_tree_ref(trans, node, extent_op,
2382                                            insert_reserved);
2383         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2384                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2385                 ret = run_delayed_data_ref(trans, node, extent_op,
2386                                            insert_reserved);
2387         else
2388                 BUG();
2389         if (ret && insert_reserved)
2390                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2391                                  node->num_bytes, 1);
2392         return ret;
2393 }
2394
2395 static inline struct btrfs_delayed_ref_node *
2396 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2397 {
2398         struct btrfs_delayed_ref_node *ref;
2399
2400         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2401                 return NULL;
2402
2403         /*
2404          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2405          * This is to prevent a ref count from going down to zero and deleting the
2406          * extent item from the extent tree while there are still references to
2407          * add, which would then fail because they would not find the extent item.
2408          */
2409         if (!list_empty(&head->ref_add_list))
2410                 return list_first_entry(&head->ref_add_list,
2411                                 struct btrfs_delayed_ref_node, add_list);
2412
2413         ref = rb_entry(rb_first_cached(&head->ref_tree),
2414                        struct btrfs_delayed_ref_node, ref_node);
2415         ASSERT(list_empty(&ref->add_list));
2416         return ref;
2417 }
2418
2419 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2420                                       struct btrfs_delayed_ref_head *head)
2421 {
2422         spin_lock(&delayed_refs->lock);
2423         head->processing = 0;
2424         delayed_refs->num_heads_ready++;
2425         spin_unlock(&delayed_refs->lock);
2426         btrfs_delayed_ref_unlock(head);
2427 }
2428
2429 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2430                                 struct btrfs_delayed_ref_head *head)
2431 {
2432         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2433
2434         if (!extent_op)
2435                 return NULL;
2436
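             /*
              * must_insert_reserved means the extent item has not been
              * inserted for this head, so there is nothing for the pending
              * extent op to update; drop it.
              */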
2437         if (head->must_insert_reserved) {
2438                 head->extent_op = NULL;
2439                 btrfs_free_delayed_extent_op(extent_op);
2440                 return NULL;
2441         }
2442         return extent_op;
2443 }
2444
2445 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2446                                      struct btrfs_delayed_ref_head *head)
2447 {
2448         struct btrfs_delayed_extent_op *extent_op;
2449         int ret;
2450
2451         extent_op = cleanup_extent_op(head);
2452         if (!extent_op)
2453                 return 0;
2454         head->extent_op = NULL;
2455         spin_unlock(&head->lock);
2456         ret = run_delayed_extent_op(trans, head, extent_op);
2457         btrfs_free_delayed_extent_op(extent_op);
2458         return ret ? ret : 1;
2459 }
2460
2461 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2462                                   struct btrfs_delayed_ref_root *delayed_refs,
2463                                   struct btrfs_delayed_ref_head *head)
2464 {
2465         int nr_items = 1;       /* Dropping this ref head update. */
2466
2467         if (head->total_ref_mod < 0) {
2468                 struct btrfs_space_info *space_info;
2469                 u64 flags;
2470
2471                 if (head->is_data)
2472                         flags = BTRFS_BLOCK_GROUP_DATA;
2473                 else if (head->is_system)
2474                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2475                 else
2476                         flags = BTRFS_BLOCK_GROUP_METADATA;
2477                 space_info = __find_space_info(fs_info, flags);
2478                 ASSERT(space_info);
2479                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2480                                    -head->num_bytes,
2481                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2482
2483                 /*
2484                  * We had csum deletions accounted for in our delayed refs rsv,
2485                  * so we need to drop the csum leaves for this update from our
2486                  * delayed_refs_rsv.
2487                  */
2488                 if (head->is_data) {
2489                         spin_lock(&delayed_refs->lock);
2490                         delayed_refs->pending_csums -= head->num_bytes;
2491                         spin_unlock(&delayed_refs->lock);
2492                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2493                                 head->num_bytes);
2494                 }
2495         }
2496
2497         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2498 }
2499
2500 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2501                             struct btrfs_delayed_ref_head *head)
2502 {
2503
2504         struct btrfs_fs_info *fs_info = trans->fs_info;
2505         struct btrfs_delayed_ref_root *delayed_refs;
2506         int ret;
2507
2508         delayed_refs = &trans->transaction->delayed_refs;
2509
2510         ret = run_and_cleanup_extent_op(trans, head);
2511         if (ret < 0) {
2512                 unselect_delayed_ref_head(delayed_refs, head);
2513                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2514                 return ret;
2515         } else if (ret) {
2516                 return ret;
2517         }
2518
2519         /*
2520          * Need to drop our head ref lock and re-acquire the delayed ref lock
2521          * and then re-check to make sure nobody got added.
2522          */
2523         spin_unlock(&head->lock);
2524         spin_lock(&delayed_refs->lock);
2525         spin_lock(&head->lock);
2526         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2527                 spin_unlock(&head->lock);
2528                 spin_unlock(&delayed_refs->lock);
2529                 return 1;
2530         }
2531         btrfs_delete_ref_head(delayed_refs, head);
2532         spin_unlock(&head->lock);
2533         spin_unlock(&delayed_refs->lock);
2534
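             /*
              * The head went to zero refs without the reserved extent item
              * ever being inserted: pin the space so it is reclaimed at
              * commit and, for data, delete any csums recorded for the range.
              */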
2535         if (head->must_insert_reserved) {
2536                 btrfs_pin_extent(fs_info, head->bytenr,
2537                                  head->num_bytes, 1);
2538                 if (head->is_data) {
2539                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2540                                               head->num_bytes);
2541                 }
2542         }
2543
2544         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2545
2546         trace_run_delayed_ref_head(fs_info, head, 0);
2547         btrfs_delayed_ref_unlock(head);
2548         btrfs_put_delayed_ref_head(head);
2549         return 0;
2550 }
2551
2552 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2553                                         struct btrfs_trans_handle *trans)
2554 {
2555         struct btrfs_delayed_ref_root *delayed_refs =
2556                 &trans->transaction->delayed_refs;
2557         struct btrfs_delayed_ref_head *head = NULL;
2558         int ret;
2559
2560         spin_lock(&delayed_refs->lock);
2561         head = btrfs_select_ref_head(delayed_refs);
2562         if (!head) {
2563                 spin_unlock(&delayed_refs->lock);
2564                 return head;
2565         }
2566
2567         /*
2568          * Grab the lock that says we are going to process all the refs for
2569          * this head
2570          */
2571         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2572         spin_unlock(&delayed_refs->lock);
2573
2574         /*
2575          * We may have dropped the spin lock to get the head mutex lock, and
2576          * that might have given someone else time to free the head.  If that's
2577          * true, it has been removed from our list and we can move on.
2578          */
2579         if (ret == -EAGAIN)
2580                 head = ERR_PTR(-EAGAIN);
2581
2582         return head;
2583 }
2584
2585 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2586                                     struct btrfs_delayed_ref_head *locked_ref,
2587                                     unsigned long *run_refs)
2588 {
2589         struct btrfs_fs_info *fs_info = trans->fs_info;
2590         struct btrfs_delayed_ref_root *delayed_refs;
2591         struct btrfs_delayed_extent_op *extent_op;
2592         struct btrfs_delayed_ref_node *ref;
2593         int must_insert_reserved = 0;
2594         int ret;
2595
2596         delayed_refs = &trans->transaction->delayed_refs;
2597
2598         lockdep_assert_held(&locked_ref->mutex);
2599         lockdep_assert_held(&locked_ref->lock);
2600
2601         while ((ref = select_delayed_ref(locked_ref))) {
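                     /*
                      * Refs still covered by an active tree mod log sequence
                      * cannot be processed yet; unlock the head and let the
                      * caller retry via -EAGAIN.
                      */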
2602                 if (ref->seq &&
2603                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2604                         spin_unlock(&locked_ref->lock);
2605                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2606                         return -EAGAIN;
2607                 }
2608
2609                 (*run_refs)++;
2610                 ref->in_tree = 0;
2611                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2612                 RB_CLEAR_NODE(&ref->ref_node);
2613                 if (!list_empty(&ref->add_list))
2614                         list_del(&ref->add_list);
2615                 /*
2616                  * When we play the delayed ref, also correct the ref_mod on the
2617                  * head.
2618                  */
2619                 switch (ref->action) {
2620                 case BTRFS_ADD_DELAYED_REF:
2621                 case BTRFS_ADD_DELAYED_EXTENT:
2622                         locked_ref->ref_mod -= ref->ref_mod;
2623                         break;
2624                 case BTRFS_DROP_DELAYED_REF:
2625                         locked_ref->ref_mod += ref->ref_mod;
2626                         break;
2627                 default:
2628                         WARN_ON(1);
2629                 }
2630                 atomic_dec(&delayed_refs->num_entries);
2631
2632                 /*
2633                  * Record the must_insert_reserved flag before we drop the
2634                  * spin lock.
2635                  */
2636                 must_insert_reserved = locked_ref->must_insert_reserved;
2637                 locked_ref->must_insert_reserved = 0;
2638
2639                 extent_op = locked_ref->extent_op;
2640                 locked_ref->extent_op = NULL;
2641                 spin_unlock(&locked_ref->lock);
2642
2643                 ret = run_one_delayed_ref(trans, ref, extent_op,
2644                                           must_insert_reserved);
2645
2646                 btrfs_free_delayed_extent_op(extent_op);
2647                 if (ret) {
2648                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2649                         btrfs_put_delayed_ref(ref);
2650                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2651                                     ret);
2652                         return ret;
2653                 }
2654
2655                 btrfs_put_delayed_ref(ref);
2656                 cond_resched();
2657
2658                 spin_lock(&locked_ref->lock);
2659                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2660         }
2661
2662         return 0;
2663 }
2664
2665 /*
2666  * Returns 0 on success or if called with an already aborted transaction.
2667  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2668  */
2669 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2670                                              unsigned long nr)
2671 {
2672         struct btrfs_fs_info *fs_info = trans->fs_info;
2673         struct btrfs_delayed_ref_root *delayed_refs;
2674         struct btrfs_delayed_ref_head *locked_ref = NULL;
2675         ktime_t start = ktime_get();
2676         int ret;
2677         unsigned long count = 0;
2678         unsigned long actual_count = 0;
2679
2680         delayed_refs = &trans->transaction->delayed_refs;
2681         do {
2682                 if (!locked_ref) {
2683                         locked_ref = btrfs_obtain_ref_head(trans);
2684                         if (IS_ERR_OR_NULL(locked_ref)) {
2685                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2686                                         continue;
2687                                 } else {
2688                                         break;
2689                                 }
2690                         }
2691                         count++;
2692                 }
2693                 /*
2694                  * We need to try and merge add/drops of the same ref since we
2695                  * can run into issues with relocate dropping the implicit ref
2696                  * and then it being added back again before the drop can
2697                  * finish.  If we merged anything we need to re-loop so we can
2698                  * get a good ref.
2699                  * Or we can get node references of the same type that weren't
2700                  * merged when created due to bumps in the tree mod seq, and
2701                  * we need to merge them to prevent adding an inline extent
2702                  * backref before dropping it (triggering a BUG_ON at
2703                  * insert_inline_extent_backref()).
2704                  */
2705                 spin_lock(&locked_ref->lock);
2706                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2707
2708                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2709                                                       &actual_count);
2710                 if (ret < 0 && ret != -EAGAIN) {
2711                         /*
2712                          * Error, btrfs_run_delayed_refs_for_head already
2713                          * unlocked everything so just bail out
2714                          */
2715                         return ret;
2716                 } else if (!ret) {
2717                         /*
2718                          * Success, perform the usual cleanup of a processed
2719                          * head
2720                          */
2721                         ret = cleanup_ref_head(trans, locked_ref);
2722                         if (ret > 0) {
2723                                 /* We dropped our lock, we need to loop. */
2724                                 ret = 0;
2725                                 continue;
2726                         } else if (ret) {
2727                                 return ret;
2728                         }
2729                 }
2730
2731                 /*
2732                  * Either success case or btrfs_run_delayed_refs_for_head
2733                  * returned -EAGAIN, meaning we need to select another head
2734                  */
2735
2736                 locked_ref = NULL;
2737                 cond_resched();
2738         } while ((nr != -1 && count < nr) || locked_ref);
2739
2740         /*
2741          * We don't want to include ref heads since we can have empty ref heads
2742          * and those will drastically skew our runtime down since we just do
2743          * accounting, no actual extent tree updates.
2744          */
2745         if (actual_count > 0) {
2746                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2747                 u64 avg;
2748
2749                 /*
2750                  * We weigh the current average higher than our current runtime
2751                  * to avoid large swings in the average.
2752                  */
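                 /*
                  * As a purely illustrative example (made-up numbers): if the
                  * previous average was 100us and this run took 20us, then
                  * avg = 3 * 100000 + 20000 = 320000ns and the stored average
                  * becomes 320000 >> 2 = 80000ns, a 3:1 weighting of old
                  * versus new.
                  */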
2753                 spin_lock(&delayed_refs->lock);
2754                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2755                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2756                 spin_unlock(&delayed_refs->lock);
2757         }
2758         return 0;
2759 }
2760
2761 #ifdef SCRAMBLE_DELAYED_REFS
2762 /*
2763  * Normally delayed refs get processed in ascending bytenr order. This
2764  * correlates in most cases to the order added. To expose dependencies on this
2765  * order, we start to process the tree in the middle instead of the beginning
2766  */
2767 static u64 find_middle(struct rb_root *root)
2768 {
2769         struct rb_node *n = root->rb_node;
2770         struct btrfs_delayed_ref_node *entry;
2771         int alt = 1;
2772         u64 middle;
2773         u64 first = 0, last = 0;
2774
2775         n = rb_first(root);
2776         if (n) {
2777                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2778                 first = entry->bytenr;
2779         }
2780         n = rb_last(root);
2781         if (n) {
2782                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2783                 last = entry->bytenr;
2784         }
2785         n = root->rb_node;
2786
2787         while (n) {
2788                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2789                 WARN_ON(!entry->in_tree);
2790
2791                 middle = entry->bytenr;
2792
2793                 if (alt)
2794                         n = n->rb_left;
2795                 else
2796                         n = n->rb_right;
2797
2798                 alt = 1 - alt;
2799         }
2800         return middle;
2801 }
2802 #endif
2803
2804 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2805 {
2806         u64 num_bytes;
2807
2808         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2809                              sizeof(struct btrfs_extent_inline_ref));
2810         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2811                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2812
2813         /*
2814          * We don't ever fill up leaves all the way so multiply by 2 just to be
2815          * closer to what we're really going to want to use.
2816          */
2817         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2818 }
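/*
 * Rough worked example for heads_to_leaves() (sizes are approximate): with
 * the skinny-metadata feature each head costs an extent item plus an inline
 * ref, roughly 33 bytes, so with ~16KiB of usable leaf space 1000 heads come
 * out to about 2 leaves.
 */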
2819
2820 /*
2821  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2822  * would require to store the csums for that many bytes.
2823  */
2824 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2825 {
2826         u64 csum_size;
2827         u64 num_csums_per_leaf;
2828         u64 num_csums;
2829
2830         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2831         num_csums_per_leaf = div64_u64(csum_size,
2832                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2833         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2834         num_csums += num_csums_per_leaf - 1;
2835         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2836         return num_csums;
2837 }
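/*
 * Rough worked example for btrfs_csum_bytes_to_leaves() (all figures
 * approximate, assuming 4KiB sectors, 4-byte crc32c checksums and ~16KiB
 * leaves): one leaf item holds on the order of 4000 csums, and 1GiB of data
 * needs 1GiB / 4KiB = 262144 csums, so about 65 leaves of csum items.
 */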
2838
2839 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2840 {
2841         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2842         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2843         bool ret = false;
2844         u64 reserved;
2845
2846         spin_lock(&global_rsv->lock);
2847         reserved = global_rsv->reserved;
2848         spin_unlock(&global_rsv->lock);
2849
2850         /*
2851          * Since the global reserve is just kind of magic we don't really want
2852          * to rely on it to save our bacon, so if our size is more than the
2853          * delayed_refs_rsv and the global rsv then it's time to think about
2854          * bailing.
2855          */
2856         spin_lock(&delayed_refs_rsv->lock);
2857         reserved += delayed_refs_rsv->reserved;
2858         if (delayed_refs_rsv->size >= reserved)
2859                 ret = true;
2860         spin_unlock(&delayed_refs_rsv->lock);
2861         return ret;
2862 }
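/*
 * For example (hypothetical sizes): with 4MiB reserved in the global rsv and
 * 2MiB reserved in the delayed refs rsv, a delayed_refs_rsv->size of 8MiB
 * exceeds the 6MiB of combined reservations, so this returns true and the
 * caller should start flushing delayed refs.
 */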
2863
2864 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2865 {
2866         u64 num_entries =
2867                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2868         u64 avg_runtime;
2869         u64 val;
2870
2871         smp_mb();
2872         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2873         val = num_entries * avg_runtime;
2874         if (val >= NSEC_PER_SEC)
2875                 return 1;
2876         if (val >= NSEC_PER_SEC / 2)
2877                 return 2;
2878
2879         return btrfs_check_space_for_delayed_refs(trans->fs_info);
2880 }
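/*
 * For example (hypothetical numbers): with an average runtime of 100us per
 * ref, 10000 pending entries estimate to a full second of work and this
 * returns 1 (throttle hard), 5000 entries estimate to half a second and it
 * returns 2, and anything less falls back to the reservation check above.
 */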
2881
2882 /*
2883  * this starts processing the delayed reference count updates and
2884  * extent insertions we have queued up so far.  count can be
2885  * 0, which means to process everything in the tree at the start
2886  * of the run (but not newly added entries), or it can be some target
2887  * number you'd like to process.
2888  *
2889  * Returns 0 on success or if called with an aborted transaction
2890  * Returns <0 on error and aborts the transaction
2891  */
2892 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2893                            unsigned long count)
2894 {
2895         struct btrfs_fs_info *fs_info = trans->fs_info;
2896         struct rb_node *node;
2897         struct btrfs_delayed_ref_root *delayed_refs;
2898         struct btrfs_delayed_ref_head *head;
2899         int ret;
2900         int run_all = count == (unsigned long)-1;
2901
2902         /* We'll clean this up in btrfs_cleanup_transaction */
2903         if (trans->aborted)
2904                 return 0;
2905
2906         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2907                 return 0;
2908
2909         delayed_refs = &trans->transaction->delayed_refs;
2910         if (count == 0)
2911                 count = atomic_read(&delayed_refs->num_entries) * 2;
2912
2913 again:
2914 #ifdef SCRAMBLE_DELAYED_REFS
2915         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2916 #endif
2917         ret = __btrfs_run_delayed_refs(trans, count);
2918         if (ret < 0) {
2919                 btrfs_abort_transaction(trans, ret);
2920                 return ret;
2921         }
2922
2923         if (run_all) {
2924                 btrfs_create_pending_block_groups(trans);
2925
2926                 spin_lock(&delayed_refs->lock);
2927                 node = rb_first_cached(&delayed_refs->href_root);
2928                 if (!node) {
2929                         spin_unlock(&delayed_refs->lock);
2930                         goto out;
2931                 }
2932                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2933                                 href_node);
2934                 refcount_inc(&head->refs);
2935                 spin_unlock(&delayed_refs->lock);
2936
2937                 /* Mutex was contended, block until it's released and retry. */
2938                 mutex_lock(&head->mutex);
2939                 mutex_unlock(&head->mutex);
2940
2941                 btrfs_put_delayed_ref_head(head);
2942                 cond_resched();
2943                 goto again;
2944         }
2945 out:
2946         return 0;
2947 }
2948
2949 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2950                                 u64 bytenr, u64 num_bytes, u64 flags,
2951                                 int level, int is_data)
2952 {
2953         struct btrfs_delayed_extent_op *extent_op;
2954         int ret;
2955
2956         extent_op = btrfs_alloc_delayed_extent_op();
2957         if (!extent_op)
2958                 return -ENOMEM;
2959
2960         extent_op->flags_to_set = flags;
2961         extent_op->update_flags = true;
2962         extent_op->update_key = false;
2963         extent_op->is_data = is_data ? true : false;
2964         extent_op->level = level;
2965
2966         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2967         if (ret)
2968                 btrfs_free_delayed_extent_op(extent_op);
2969         return ret;
2970 }
2971
2972 static noinline int check_delayed_ref(struct btrfs_root *root,
2973                                       struct btrfs_path *path,
2974                                       u64 objectid, u64 offset, u64 bytenr)
2975 {
2976         struct btrfs_delayed_ref_head *head;
2977         struct btrfs_delayed_ref_node *ref;
2978         struct btrfs_delayed_data_ref *data_ref;
2979         struct btrfs_delayed_ref_root *delayed_refs;
2980         struct btrfs_transaction *cur_trans;
2981         struct rb_node *node;
2982         int ret = 0;
2983
2984         spin_lock(&root->fs_info->trans_lock);
2985         cur_trans = root->fs_info->running_transaction;
2986         if (cur_trans)
2987                 refcount_inc(&cur_trans->use_count);
2988         spin_unlock(&root->fs_info->trans_lock);
2989         if (!cur_trans)
2990                 return 0;
2991
2992         delayed_refs = &cur_trans->delayed_refs;
2993         spin_lock(&delayed_refs->lock);
2994         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2995         if (!head) {
2996                 spin_unlock(&delayed_refs->lock);
2997                 btrfs_put_transaction(cur_trans);
2998                 return 0;
2999         }
3000
3001         if (!mutex_trylock(&head->mutex)) {
3002                 refcount_inc(&head->refs);
3003                 spin_unlock(&delayed_refs->lock);
3004
3005                 btrfs_release_path(path);
3006
3007                 /*
3008                  * Mutex was contended, block until it's released and let
3009                  * caller try again
3010                  */
3011                 mutex_lock(&head->mutex);
3012                 mutex_unlock(&head->mutex);
3013                 btrfs_put_delayed_ref_head(head);
3014                 btrfs_put_transaction(cur_trans);
3015                 return -EAGAIN;
3016         }
3017         spin_unlock(&delayed_refs->lock);
3018
3019         spin_lock(&head->lock);
3020         /*
3021          * XXX: We should replace this with a proper search function in the
3022          * future.
3023          */
3024         for (node = rb_first_cached(&head->ref_tree); node;
3025              node = rb_next(node)) {
3026                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3027                 /* If it's a shared ref we know a cross reference exists */
3028                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3029                         ret = 1;
3030                         break;
3031                 }
3032
3033                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3034
3035                 /*
3036                  * If our ref doesn't match the one we're currently looking at
3037                  * then we have a cross reference.
3038                  */
3039                 if (data_ref->root != root->root_key.objectid ||
3040                     data_ref->objectid != objectid ||
3041                     data_ref->offset != offset) {
3042                         ret = 1;
3043                         break;
3044                 }
3045         }
3046         spin_unlock(&head->lock);
3047         mutex_unlock(&head->mutex);
3048         btrfs_put_transaction(cur_trans);
3049         return ret;
3050 }
3051
3052 static noinline int check_committed_ref(struct btrfs_root *root,
3053                                         struct btrfs_path *path,
3054                                         u64 objectid, u64 offset, u64 bytenr)
3055 {
3056         struct btrfs_fs_info *fs_info = root->fs_info;
3057         struct btrfs_root *extent_root = fs_info->extent_root;
3058         struct extent_buffer *leaf;
3059         struct btrfs_extent_data_ref *ref;
3060         struct btrfs_extent_inline_ref *iref;
3061         struct btrfs_extent_item *ei;
3062         struct btrfs_key key;
3063         u32 item_size;
3064         int type;
3065         int ret;
3066
3067         key.objectid = bytenr;
3068         key.offset = (u64)-1;
3069         key.type = BTRFS_EXTENT_ITEM_KEY;
3070
3071         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3072         if (ret < 0)
3073                 goto out;
3074         BUG_ON(ret == 0); /* Corruption */
3075
3076         ret = -ENOENT;
3077         if (path->slots[0] == 0)
3078                 goto out;
3079
3080         path->slots[0]--;
3081         leaf = path->nodes[0];
3082         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3083
3084         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3085                 goto out;
3086
3087         ret = 1;
3088         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3089         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);