btrfs: extent-tree: Detect bytes_may_use underflow earlier
[sfrench/cifs-2.6.git] / fs / btrfs / extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31
32 #undef SCRAMBLE_DELAYED_REFS
33
/*
 * Control flags for the 'force' argument of do_chunk_alloc():
 *
 * CHUNK_ALLOC_NO_FORCE - only allocate a chunk if we really need one.
 *
 * CHUNK_ALLOC_LIMITED  - only try to allocate a chunk if we have very few
 *                        chunks already allocated.  This is used as part of
 *                        the clustering code to help make sure we have a
 *                        good pool of storage to cluster in, without filling
 *                        the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE    - it must try to allocate a chunk.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};
53
54 /*
55  * Declare a helper function to detect underflow of various space info members
56  */
57 #define DECLARE_SPACE_INFO_UPDATE(name)                                 \
58 static inline void update_##name(struct btrfs_space_info *sinfo,        \
59                                  s64 bytes)                             \
60 {                                                                       \
61         if (bytes < 0 && sinfo->name < -bytes) {                        \
62                 WARN_ON(1);                                             \
63                 sinfo->name = 0;                                        \
64                 return;                                                 \
65         }                                                               \
66         sinfo->name += bytes;                                           \
67 }
68
69 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
70
71 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
72                                struct btrfs_delayed_ref_node *node, u64 parent,
73                                u64 root_objectid, u64 owner_objectid,
74                                u64 owner_offset, int refs_to_drop,
75                                struct btrfs_delayed_extent_op *extra_op);
76 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
77                                     struct extent_buffer *leaf,
78                                     struct btrfs_extent_item *ei);
79 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
80                                       u64 parent, u64 root_objectid,
81                                       u64 flags, u64 owner, u64 offset,
82                                       struct btrfs_key *ins, int ref_mod);
83 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
84                                      struct btrfs_delayed_ref_node *node,
85                                      struct btrfs_delayed_extent_op *extent_op);
86 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
87                           int force);
88 static int find_next_key(struct btrfs_path *path, int level,
89                          struct btrfs_key *key);
90 static void dump_space_info(struct btrfs_fs_info *fs_info,
91                             struct btrfs_space_info *info, u64 bytes,
92                             int dump_block_groups);
93 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
94                                u64 num_bytes);
95 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
96                                      struct btrfs_space_info *space_info,
97                                      u64 num_bytes);
98 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
99                                      struct btrfs_space_info *space_info,
100                                      u64 num_bytes);
101
102 static noinline int
103 block_group_cache_done(struct btrfs_block_group_cache *cache)
104 {
105         smp_mb();
106         return cache->cached == BTRFS_CACHE_FINISHED ||
107                 cache->cached == BTRFS_CACHE_ERROR;
108 }
109
110 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
111 {
112         return (cache->flags & bits) == bits;
113 }
114
/*
 * Take a reference on a block group by incrementing its atomic count.  The
 * reference is dropped again with btrfs_put_block_group().
 */
void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}
119
120 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
121 {
122         if (atomic_dec_and_test(&cache->count)) {
123                 WARN_ON(cache->pinned > 0);
124                 WARN_ON(cache->reserved > 0);
125
126                 /*
127                  * If not empty, someone is still holding mutex of
128                  * full_stripe_lock, which can only be released by caller.
129                  * And it will definitely cause use-after-free when caller
130                  * tries to release full stripe lock.
131                  *
132                  * No better way to resolve, but only to warn.
133                  */
134                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
135                 kfree(cache->free_space_ctl);
136                 kfree(cache);
137         }
138 }
139
140 /*
141  * this adds the block group to the fs_info rb tree for the block group
142  * cache
143  */
144 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
145                                 struct btrfs_block_group_cache *block_group)
146 {
147         struct rb_node **p;
148         struct rb_node *parent = NULL;
149         struct btrfs_block_group_cache *cache;
150
151         spin_lock(&info->block_group_cache_lock);
152         p = &info->block_group_cache_tree.rb_node;
153
154         while (*p) {
155                 parent = *p;
156                 cache = rb_entry(parent, struct btrfs_block_group_cache,
157                                  cache_node);
158                 if (block_group->key.objectid < cache->key.objectid) {
159                         p = &(*p)->rb_left;
160                 } else if (block_group->key.objectid > cache->key.objectid) {
161                         p = &(*p)->rb_right;
162                 } else {
163                         spin_unlock(&info->block_group_cache_lock);
164                         return -EEXIST;
165                 }
166         }
167
168         rb_link_node(&block_group->cache_node, parent, p);
169         rb_insert_color(&block_group->cache_node,
170                         &info->block_group_cache_tree);
171
172         if (info->first_logical_byte > block_group->key.objectid)
173                 info->first_logical_byte = block_group->key.objectid;
174
175         spin_unlock(&info->block_group_cache_lock);
176
177         return 0;
178 }
179
180 /*
181  * This will return the block group at or after bytenr if contains is 0, else
182  * it will return the block group that contains the bytenr
183  */
184 static struct btrfs_block_group_cache *
185 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
186                               int contains)
187 {
188         struct btrfs_block_group_cache *cache, *ret = NULL;
189         struct rb_node *n;
190         u64 end, start;
191
192         spin_lock(&info->block_group_cache_lock);
193         n = info->block_group_cache_tree.rb_node;
194
195         while (n) {
196                 cache = rb_entry(n, struct btrfs_block_group_cache,
197                                  cache_node);
198                 end = cache->key.objectid + cache->key.offset - 1;
199                 start = cache->key.objectid;
200
201                 if (bytenr < start) {
202                         if (!contains && (!ret || start < ret->key.objectid))
203                                 ret = cache;
204                         n = n->rb_left;
205                 } else if (bytenr > start) {
206                         if (contains && bytenr <= end) {
207                                 ret = cache;
208                                 break;
209                         }
210                         n = n->rb_right;
211                 } else {
212                         ret = cache;
213                         break;
214                 }
215         }
216         if (ret) {
217                 btrfs_get_block_group(ret);
218                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
219                         info->first_logical_byte = ret->key.objectid;
220         }
221         spin_unlock(&info->block_group_cache_lock);
222
223         return ret;
224 }
225
226 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
227                                u64 start, u64 num_bytes)
228 {
229         u64 end = start + num_bytes - 1;
230         set_extent_bits(&fs_info->freed_extents[0],
231                         start, end, EXTENT_UPTODATE);
232         set_extent_bits(&fs_info->freed_extents[1],
233                         start, end, EXTENT_UPTODATE);
234         return 0;
235 }
236
237 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
238 {
239         struct btrfs_fs_info *fs_info = cache->fs_info;
240         u64 start, end;
241
242         start = cache->key.objectid;
243         end = start + cache->key.offset - 1;
244
245         clear_extent_bits(&fs_info->freed_extents[0],
246                           start, end, EXTENT_UPTODATE);
247         clear_extent_bits(&fs_info->freed_extents[1],
248                           start, end, EXTENT_UPTODATE);
249 }
250
251 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
252 {
253         struct btrfs_fs_info *fs_info = cache->fs_info;
254         u64 bytenr;
255         u64 *logical;
256         int stripe_len;
257         int i, nr, ret;
258
259         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
260                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
261                 cache->bytes_super += stripe_len;
262                 ret = add_excluded_extent(fs_info, cache->key.objectid,
263                                           stripe_len);
264                 if (ret)
265                         return ret;
266         }
267
268         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
269                 bytenr = btrfs_sb_offset(i);
270                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
271                                        bytenr, &logical, &nr, &stripe_len);
272                 if (ret)
273                         return ret;
274
275                 while (nr--) {
276                         u64 start, len;
277
278                         if (logical[nr] > cache->key.objectid +
279                             cache->key.offset)
280                                 continue;
281
282                         if (logical[nr] + stripe_len <= cache->key.objectid)
283                                 continue;
284
285                         start = logical[nr];
286                         if (start < cache->key.objectid) {
287                                 start = cache->key.objectid;
288                                 len = (logical[nr] + stripe_len) - start;
289                         } else {
290                                 len = min_t(u64, stripe_len,
291                                             cache->key.objectid +
292                                             cache->key.offset - start);
293                         }
294
295                         cache->bytes_super += len;
296                         ret = add_excluded_extent(fs_info, start, len);
297                         if (ret) {
298                                 kfree(logical);
299                                 return ret;
300                         }
301                 }
302
303                 kfree(logical);
304         }
305         return 0;
306 }
307
308 static struct btrfs_caching_control *
309 get_caching_control(struct btrfs_block_group_cache *cache)
310 {
311         struct btrfs_caching_control *ctl;
312
313         spin_lock(&cache->lock);
314         if (!cache->caching_ctl) {
315                 spin_unlock(&cache->lock);
316                 return NULL;
317         }
318
319         ctl = cache->caching_ctl;
320         refcount_inc(&ctl->count);
321         spin_unlock(&cache->lock);
322         return ctl;
323 }
324
325 static void put_caching_control(struct btrfs_caching_control *ctl)
326 {
327         if (refcount_dec_and_test(&ctl->count))
328                 kfree(ctl);
329 }
330
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper: punch a hole of one unit (nodesize for metadata block
 * groups, sectorsize otherwise) out of the free space at every second unit
 * of the block group.
 */
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 pos = block_group->key.objectid;
	u64 remaining = block_group->key.offset;
	u64 hole;
	u64 stride;

	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
		hole = fs_info->nodesize;
	else
		hole = fs_info->sectorsize;
	stride = hole << 1;

	while (remaining > hole) {
		btrfs_remove_free_space(block_group, pos, hole);
		pos += stride;
		/* Do not let the remaining length wrap below zero. */
		remaining = remaining < stride ? 0 : remaining - stride;
	}
}
#endif
351
352 /*
353  * this is only called by cache_block_group, since we could have freed extents
354  * we need to check the pinned_extents for any extents that can't be used yet
355  * since their free space will be released as soon as the transaction commits.
356  */
357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
358                        u64 start, u64 end)
359 {
360         struct btrfs_fs_info *info = block_group->fs_info;
361         u64 extent_start, extent_end, size, total_added = 0;
362         int ret;
363
364         while (start < end) {
365                 ret = find_first_extent_bit(info->pinned_extents, start,
366                                             &extent_start, &extent_end,
367                                             EXTENT_DIRTY | EXTENT_UPTODATE,
368                                             NULL);
369                 if (ret)
370                         break;
371
372                 if (extent_start <= start) {
373                         start = extent_end + 1;
374                 } else if (extent_start > start && extent_start < end) {
375                         size = extent_start - start;
376                         total_added += size;
377                         ret = btrfs_add_free_space(block_group, start,
378                                                    size);
379                         BUG_ON(ret); /* -ENOMEM or logic error */
380                         start = extent_end + 1;
381                 } else {
382                         break;
383                 }
384         }
385
386         if (start < end) {
387                 size = end - start;
388                 total_added += size;
389                 ret = btrfs_add_free_space(block_group, start, size);
390                 BUG_ON(ret); /* -ENOMEM or logic error */
391         }
392
393         return total_added;
394 }
395
/*
 * Build the free space of a block group by walking the commit root of the
 * extent tree.
 *
 * Every gap between consecutive EXTENT_ITEM / METADATA_ITEM keys inside the
 * block group is passed to add_new_free_space().  The walk starts at the
 * larger of the block group start and BTRFS_SUPER_INFO_OFFSET.
 *
 * When rescheduling, the function drops and retakes caching_ctl->mutex and
 * fs_info->commit_root_sem (read side), so the caller is expected to hold
 * both on entry.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative value returned by btrfs_search_slot() / btrfs_next_leaf().
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* 'last' tracks the end of the most recently seen allocated extent. */
	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	/* (Re)start the search from 'key'; reached again after each restart. */
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		/* Filesystem is shutting down: stop walking the tree. */
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			/* Ran off the end of this leaf. */
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			/*
			 * Give up the CPU or the commit root semaphore if
			 * somebody else wants it: publish the progress, drop
			 * the path and both locks, reschedule, retake the
			 * locks and search again from 'key'.
			 */
			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		/* Key is behind what was already processed: search again. */
		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		/* Item lies before the start of the block group: skip it. */
		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		/* Item lies at or past the end of the block group: done. */
		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			/* The gap between 'last' and this extent is free. */
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			/*
			 * For METADATA_ITEM keys the extent length is taken
			 * to be nodesize rather than key.offset.
			 */
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			/* Wake waiters once enough new space was found. */
			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	/* Account the tail between the last extent and the block group end. */
	total_found += add_new_free_space(block_group, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
532
533 static noinline void caching_thread(struct btrfs_work *work)
534 {
535         struct btrfs_block_group_cache *block_group;
536         struct btrfs_fs_info *fs_info;
537         struct btrfs_caching_control *caching_ctl;
538         int ret;
539
540         caching_ctl = container_of(work, struct btrfs_caching_control, work);
541         block_group = caching_ctl->block_group;
542         fs_info = block_group->fs_info;
543
544         mutex_lock(&caching_ctl->mutex);
545         down_read(&fs_info->commit_root_sem);
546
547         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
548                 ret = load_free_space_tree(caching_ctl);
549         else
550                 ret = load_extent_tree_free(caching_ctl);
551
552         spin_lock(&block_group->lock);
553         block_group->caching_ctl = NULL;
554         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
555         spin_unlock(&block_group->lock);
556
557 #ifdef CONFIG_BTRFS_DEBUG
558         if (btrfs_should_fragment_free_space(block_group)) {
559                 u64 bytes_used;
560
561                 spin_lock(&block_group->space_info->lock);
562                 spin_lock(&block_group->lock);
563                 bytes_used = block_group->key.offset -
564                         btrfs_block_group_used(&block_group->item);
565                 block_group->space_info->bytes_used += bytes_used >> 1;
566                 spin_unlock(&block_group->lock);
567                 spin_unlock(&block_group->space_info->lock);
568                 fragment_free_space(block_group);
569         }
570 #endif
571
572         caching_ctl->progress = (u64)-1;
573
574         up_read(&fs_info->commit_root_sem);
575         free_excluded_extents(block_group);
576         mutex_unlock(&caching_ctl->mutex);
577
578         wake_up(&caching_ctl->wait);
579
580         put_caching_control(caching_ctl);
581         btrfs_put_block_group(block_group);
582 }
583
/*
 * Start caching the free space of a block group.
 *
 * A caching control is allocated up front.  After waiting for any
 * concurrent BTRFS_CACHE_FAST load to finish, the function returns 0 without
 * doing anything if the block group is in any state other than
 * BTRFS_CACHE_NO.  Otherwise, if the SPACE_CACHE option is set,
 * load_free_space_cache() is tried first; a return value of 1 from it marks
 * the block group BTRFS_CACHE_FINISHED.  In the remaining cases, unless
 * @load_cache_only is set, the control is put on
 * fs_info->caching_block_groups and the caching work is queued on
 * fs_info->caching_workers.
 *
 * Returns -ENOMEM if the caching control cannot be allocated, 0 on the early
 * exits, and otherwise 'ret' (0, or whatever load_free_space_cache()
 * returned).
 */
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		/* Hold a reference on the other control while sleeping on it. */
		ctl = cache->caching_ctl;
		refcount_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	/* Caching already started or finished: our control is not needed. */
	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			/* Loading the space cache succeeded: caching is done. */
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				/* Caller only wanted the fast load: reset. */
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				/* Fall back to the caching work queued below. */
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			/*
			 * Despite its name, bytes_used holds the unused part
			 * of the block group; half of it is accounted as used
			 * before the free space gets fragmented.
			 */
			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			/* Drop the initial reference taken by refcount_set(). */
			put_caching_control(caching_ctl);
			free_excluded_extents(cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	/*
	 * Put the control on the list of caching block groups; the extra
	 * reference is presumably owned by that list entry.
	 */
	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	/* Dropped by caching_thread() once the work has run. */
	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
719
/*
 * Return the block group that starts at or after @bytenr, or NULL if there
 * is none.  This is block_group_cache_tree_search() with contains == 0, so
 * the returned block group carries a reference.
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}
728
/*
 * Return the block group that contains @bytenr, or NULL if there is none.
 * This is block_group_cache_tree_search() with contains == 1, so the
 * returned block group carries a reference.
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}
738
739 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
740                                                   u64 flags)
741 {
742         struct list_head *head = &info->space_info;
743         struct btrfs_space_info *found;
744
745         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
746
747         rcu_read_lock();
748         list_for_each_entry_rcu(found, head, list) {
749                 if (found->flags & flags) {
750                         rcu_read_unlock();
751                         return found;
752                 }
753         }
754         rcu_read_unlock();
755         return NULL;
756 }
757
758 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
759                              bool metadata, u64 root_objectid)
760 {
761         struct btrfs_space_info *space_info;
762         u64 flags;
763
764         if (metadata) {
765                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
766                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
767                 else
768                         flags = BTRFS_BLOCK_GROUP_METADATA;
769         } else {
770                 flags = BTRFS_BLOCK_GROUP_DATA;
771         }
772
773         space_info = __find_space_info(fs_info, flags);
774         ASSERT(space_info);
775         percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
776                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
777 }
778
779 /*
780  * after adding space to the filesystem, we need to clear the full flags
781  * on all the space infos.
782  */
783 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
784 {
785         struct list_head *head = &info->space_info;
786         struct btrfs_space_info *found;
787
788         rcu_read_lock();
789         list_for_each_entry_rcu(found, head, list)
790                 found->full = 0;
791         rcu_read_unlock();
792 }
793
794 /* simple helper to search for an existing data extent at a given offset */
795 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
796 {
797         int ret;
798         struct btrfs_key key;
799         struct btrfs_path *path;
800
801         path = btrfs_alloc_path();
802         if (!path)
803                 return -ENOMEM;
804
805         key.objectid = start;
806         key.offset = len;
807         key.type = BTRFS_EXTENT_ITEM_KEY;
808         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
809         btrfs_free_path(path);
810         return ret;
811 }
812
813 /*
814  * helper function to lookup reference count and flags of a tree block.
815  *
816  * the head node for delayed ref is used to store the sum of all the
817  * reference count modifications queued up in the rbtree. the head
818  * node may also store the extent flags to set. This way you can check
819  * to see what the reference count and extent flags would be if all of
820  * the delayed refs are not processed.
821  */
822 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
823                              struct btrfs_fs_info *fs_info, u64 bytenr,
824                              u64 offset, int metadata, u64 *refs, u64 *flags)
825 {
826         struct btrfs_delayed_ref_head *head;
827         struct btrfs_delayed_ref_root *delayed_refs;
828         struct btrfs_path *path;
829         struct btrfs_extent_item *ei;
830         struct extent_buffer *leaf;
831         struct btrfs_key key;
832         u32 item_size;
833         u64 num_refs;
834         u64 extent_flags;
835         int ret;
836
837         /*
838          * If we don't have skinny metadata, don't bother doing anything
839          * different
840          */
841         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
842                 offset = fs_info->nodesize;
843                 metadata = 0;
844         }
845
846         path = btrfs_alloc_path();
847         if (!path)
848                 return -ENOMEM;
849
850         if (!trans) {
851                 path->skip_locking = 1;
852                 path->search_commit_root = 1;
853         }
854
855 search_again:
856         key.objectid = bytenr;
857         key.offset = offset;
858         if (metadata)
859                 key.type = BTRFS_METADATA_ITEM_KEY;
860         else
861                 key.type = BTRFS_EXTENT_ITEM_KEY;
862
863         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
864         if (ret < 0)
865                 goto out_free;
866
867         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
868                 if (path->slots[0]) {
869                         path->slots[0]--;
870                         btrfs_item_key_to_cpu(path->nodes[0], &key,
871                                               path->slots[0]);
872                         if (key.objectid == bytenr &&
873                             key.type == BTRFS_EXTENT_ITEM_KEY &&
874                             key.offset == fs_info->nodesize)
875                                 ret = 0;
876                 }
877         }
878
879         if (ret == 0) {
880                 leaf = path->nodes[0];
881                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
882                 if (item_size >= sizeof(*ei)) {
883                         ei = btrfs_item_ptr(leaf, path->slots[0],
884                                             struct btrfs_extent_item);
885                         num_refs = btrfs_extent_refs(leaf, ei);
886                         extent_flags = btrfs_extent_flags(leaf, ei);
887                 } else {
888                         ret = -EINVAL;
889                         btrfs_print_v0_err(fs_info);
890                         if (trans)
891                                 btrfs_abort_transaction(trans, ret);
892                         else
893                                 btrfs_handle_fs_error(fs_info, ret, NULL);
894
895                         goto out_free;
896                 }
897
898                 BUG_ON(num_refs == 0);
899         } else {
900                 num_refs = 0;
901                 extent_flags = 0;
902                 ret = 0;
903         }
904
905         if (!trans)
906                 goto out;
907
908         delayed_refs = &trans->transaction->delayed_refs;
909         spin_lock(&delayed_refs->lock);
910         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
911         if (head) {
912                 if (!mutex_trylock(&head->mutex)) {
913                         refcount_inc(&head->refs);
914                         spin_unlock(&delayed_refs->lock);
915
916                         btrfs_release_path(path);
917
918                         /*
919                          * Mutex was contended, block until it's released and try
920                          * again
921                          */
922                         mutex_lock(&head->mutex);
923                         mutex_unlock(&head->mutex);
924                         btrfs_put_delayed_ref_head(head);
925                         goto search_again;
926                 }
927                 spin_lock(&head->lock);
928                 if (head->extent_op && head->extent_op->update_flags)
929                         extent_flags |= head->extent_op->flags_to_set;
930                 else
931                         BUG_ON(num_refs == 0);
932
933                 num_refs += head->ref_mod;
934                 spin_unlock(&head->lock);
935                 mutex_unlock(&head->mutex);
936         }
937         spin_unlock(&delayed_refs->lock);
938 out:
939         WARN_ON(num_refs == 0);
940         if (refs)
941                 *refs = num_refs;
942         if (flags)
943                 *flags = extent_flags;
944 out_free:
945         btrfs_free_path(path);
946         return ret;
947 }
948
949 /*
950  * Back reference rules.  Back refs have three main goals:
951  *
952  * 1) differentiate between all holders of references to an extent so that
953  *    when a reference is dropped we can make sure it was a valid reference
954  *    before freeing the extent.
955  *
956  * 2) Provide enough information to quickly find the holders of an extent
957  *    if we notice a given block is corrupted or bad.
958  *
959  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
960  *    maintenance.  This is actually the same as #2, but with a slightly
961  *    different use case.
962  *
963  * There are two kinds of back refs. The implicit back refs is optimized
964  * for pointers in non-shared tree blocks. For a given pointer in a block,
965  * back refs of this kind provide information about the block's owner tree
966  * and the pointer's key. These information allow us to find the block by
967  * b-tree searching. The full back refs is for pointers in tree blocks not
968  * referenced by their owner trees. The location of tree block is recorded
969  * in the back refs. Actually the full back refs is generic, and can be
970  * used in all cases the implicit back refs is used. The major shortcoming
971  * of the full back refs is its overhead. Every time a tree block gets
972  * COWed, we have to update back refs entry for all pointers in it.
973  *
974  * For a newly allocated tree block, we use implicit back refs for
975  * pointers in it. This means most tree related operations only involve
976  * implicit back refs. For a tree block created in old transaction, the
977  * only way to drop a reference to it is COW it. So we can detect the
978  * event that tree block loses its owner tree's reference and do the
979  * back refs conversion.
980  *
981  * When a tree block is COWed through a tree, there are four cases:
982  *
983  * The reference count of the block is one and the tree is the block's
984  * owner tree. Nothing to do in this case.
985  *
986  * The reference count of the block is one and the tree is not the
987  * block's owner tree. In this case, full back refs is used for pointers
988  * in the block. Remove these full back refs, add implicit back refs for
989  * every pointers in the new block.
990  *
991  * The reference count of the block is greater than one and the tree is
992  * the block's owner tree. In this case, implicit back refs is used for
993  * pointers in the block. Add full back refs for every pointers in the
994  * block, increase lower level extents' reference counts. The original
995  * implicit back refs are entailed to the new block.
996  *
997  * The reference count of the block is greater than one and the tree is
998  * not the block's owner tree. Add implicit back refs for every pointer in
999  * the new block, increase lower level extents' reference count.
1000  *
1001  * Back Reference Key composing:
1002  *
1003  * The key objectid corresponds to the first byte in the extent,
1004  * The key type is used to differentiate between types of back refs.
1005  * There are different meanings of the key offset for different types
1006  * of back refs.
1007  *
1008  * File extents can be referenced by:
1009  *
1010  * - multiple snapshots, subvolumes, or different generations in one subvol
1011  * - different files inside a single subvolume
1012  * - different offsets inside a file (bookend extents in file.c)
1013  *
1014  * The extent ref structure for the implicit back refs has fields for:
1015  *
1016  * - Objectid of the subvolume root
1017  * - objectid of the file holding the reference
1018  * - original offset in the file
1019  * - how many bookend extents
1020  *
1021  * The key offset for the implicit back refs is hash of the first
1022  * three fields.
1023  *
1024  * The extent ref structure for the full back refs has field for:
1025  *
1026  * - number of pointers in the tree leaf
1027  *
1028  * The key offset for the full back refs is the first byte of
1029  * the tree leaf
1030  *
1031  * When a file extent is allocated, The implicit back refs is used.
1032  * the fields are filled in:
1033  *
1034  *     (root_key.objectid, inode objectid, offset in file, 1)
1035  *
1036  * When a file extent is removed by file truncation, we find the
1037  * corresponding implicit back refs and check the following fields:
1038  *
1039  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1040  *
1041  * Btree extents can be referenced by:
1042  *
1043  * - Different subvolumes
1044  *
1045  * Both the implicit back refs and the full back refs for tree blocks
1046  * only consist of key. The key offset for the implicit back refs is
1047  * objectid of block's owner tree. The key offset for the full back refs
1048  * is the first byte of parent block.
1049  *
1050  * When implicit back refs is used, information about the lowest key and
1051  * level of the tree block are required. These information are stored in
1052  * tree block info structure.
1053  */
1054
1055 /*
1056  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1057  * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
1058  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1059  */
1060 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1061                                      struct btrfs_extent_inline_ref *iref,
1062                                      enum btrfs_inline_ref_type is_data)
1063 {
1064         int type = btrfs_extent_inline_ref_type(eb, iref);
1065         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1066
1067         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1068             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1069             type == BTRFS_SHARED_DATA_REF_KEY ||
1070             type == BTRFS_EXTENT_DATA_REF_KEY) {
1071                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1072                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1073                                 return type;
1074                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1075                                 ASSERT(eb->fs_info);
1076                                 /*
1077                                  * Every shared one has parent tree
1078                                  * block, which must be aligned to
1079                                  * nodesize.
1080                                  */
1081                                 if (offset &&
1082                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1083                                         return type;
1084                         }
1085                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1086                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1087                                 return type;
1088                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1089                                 ASSERT(eb->fs_info);
1090                                 /*
1091                                  * Every shared one has parent tree
1092                                  * block, which must be aligned to
1093                                  * nodesize.
1094                                  */
1095                                 if (offset &&
1096                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1097                                         return type;
1098                         }
1099                 } else {
1100                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1101                         return type;
1102                 }
1103         }
1104
1105         btrfs_print_leaf((struct extent_buffer *)eb);
1106         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1107                   eb->start, type);
1108         WARN_ON(1);
1109
1110         return BTRFS_REF_TYPE_INVALID;
1111 }
1112
1113 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1114 {
1115         u32 high_crc = ~(u32)0;
1116         u32 low_crc = ~(u32)0;
1117         __le64 lenum;
1118
1119         lenum = cpu_to_le64(root_objectid);
1120         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1121         lenum = cpu_to_le64(owner);
1122         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1123         lenum = cpu_to_le64(offset);
1124         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1125
1126         return ((u64)high_crc << 31) ^ (u64)low_crc;
1127 }
1128
1129 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1130                                      struct btrfs_extent_data_ref *ref)
1131 {
1132         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1133                                     btrfs_extent_data_ref_objectid(leaf, ref),
1134                                     btrfs_extent_data_ref_offset(leaf, ref));
1135 }
1136
1137 static int match_extent_data_ref(struct extent_buffer *leaf,
1138                                  struct btrfs_extent_data_ref *ref,
1139                                  u64 root_objectid, u64 owner, u64 offset)
1140 {
1141         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1142             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1143             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1144                 return 0;
1145         return 1;
1146 }
1147
1148 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1149                                            struct btrfs_path *path,
1150                                            u64 bytenr, u64 parent,
1151                                            u64 root_objectid,
1152                                            u64 owner, u64 offset)
1153 {
1154         struct btrfs_root *root = trans->fs_info->extent_root;
1155         struct btrfs_key key;
1156         struct btrfs_extent_data_ref *ref;
1157         struct extent_buffer *leaf;
1158         u32 nritems;
1159         int ret;
1160         int recow;
1161         int err = -ENOENT;
1162
1163         key.objectid = bytenr;
1164         if (parent) {
1165                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1166                 key.offset = parent;
1167         } else {
1168                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1169                 key.offset = hash_extent_data_ref(root_objectid,
1170                                                   owner, offset);
1171         }
1172 again:
1173         recow = 0;
1174         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1175         if (ret < 0) {
1176                 err = ret;
1177                 goto fail;
1178         }
1179
1180         if (parent) {
1181                 if (!ret)
1182                         return 0;
1183                 goto fail;
1184         }
1185
1186         leaf = path->nodes[0];
1187         nritems = btrfs_header_nritems(leaf);
1188         while (1) {
1189                 if (path->slots[0] >= nritems) {
1190                         ret = btrfs_next_leaf(root, path);
1191                         if (ret < 0)
1192                                 err = ret;
1193                         if (ret)
1194                                 goto fail;
1195
1196                         leaf = path->nodes[0];
1197                         nritems = btrfs_header_nritems(leaf);
1198                         recow = 1;
1199                 }
1200
1201                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1202                 if (key.objectid != bytenr ||
1203                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1204                         goto fail;
1205
1206                 ref = btrfs_item_ptr(leaf, path->slots[0],
1207                                      struct btrfs_extent_data_ref);
1208
1209                 if (match_extent_data_ref(leaf, ref, root_objectid,
1210                                           owner, offset)) {
1211                         if (recow) {
1212                                 btrfs_release_path(path);
1213                                 goto again;
1214                         }
1215                         err = 0;
1216                         break;
1217                 }
1218                 path->slots[0]++;
1219         }
1220 fail:
1221         return err;
1222 }
1223
1224 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1225                                            struct btrfs_path *path,
1226                                            u64 bytenr, u64 parent,
1227                                            u64 root_objectid, u64 owner,
1228                                            u64 offset, int refs_to_add)
1229 {
1230         struct btrfs_root *root = trans->fs_info->extent_root;
1231         struct btrfs_key key;
1232         struct extent_buffer *leaf;
1233         u32 size;
1234         u32 num_refs;
1235         int ret;
1236
1237         key.objectid = bytenr;
1238         if (parent) {
1239                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1240                 key.offset = parent;
1241                 size = sizeof(struct btrfs_shared_data_ref);
1242         } else {
1243                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1244                 key.offset = hash_extent_data_ref(root_objectid,
1245                                                   owner, offset);
1246                 size = sizeof(struct btrfs_extent_data_ref);
1247         }
1248
1249         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1250         if (ret && ret != -EEXIST)
1251                 goto fail;
1252
1253         leaf = path->nodes[0];
1254         if (parent) {
1255                 struct btrfs_shared_data_ref *ref;
1256                 ref = btrfs_item_ptr(leaf, path->slots[0],
1257                                      struct btrfs_shared_data_ref);
1258                 if (ret == 0) {
1259                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1260                 } else {
1261                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1262                         num_refs += refs_to_add;
1263                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1264                 }
1265         } else {
1266                 struct btrfs_extent_data_ref *ref;
1267                 while (ret == -EEXIST) {
1268                         ref = btrfs_item_ptr(leaf, path->slots[0],
1269                                              struct btrfs_extent_data_ref);
1270                         if (match_extent_data_ref(leaf, ref, root_objectid,
1271                                                   owner, offset))
1272                                 break;
1273                         btrfs_release_path(path);
1274                         key.offset++;
1275                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1276                                                       size);
1277                         if (ret && ret != -EEXIST)
1278                                 goto fail;
1279
1280                         leaf = path->nodes[0];
1281                 }
1282                 ref = btrfs_item_ptr(leaf, path->slots[0],
1283                                      struct btrfs_extent_data_ref);
1284                 if (ret == 0) {
1285                         btrfs_set_extent_data_ref_root(leaf, ref,
1286                                                        root_objectid);
1287                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1288                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1289                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1290                 } else {
1291                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1292                         num_refs += refs_to_add;
1293                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1294                 }
1295         }
1296         btrfs_mark_buffer_dirty(leaf);
1297         ret = 0;
1298 fail:
1299         btrfs_release_path(path);
1300         return ret;
1301 }
1302
1303 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1304                                            struct btrfs_path *path,
1305                                            int refs_to_drop, int *last_ref)
1306 {
1307         struct btrfs_key key;
1308         struct btrfs_extent_data_ref *ref1 = NULL;
1309         struct btrfs_shared_data_ref *ref2 = NULL;
1310         struct extent_buffer *leaf;
1311         u32 num_refs = 0;
1312         int ret = 0;
1313
1314         leaf = path->nodes[0];
1315         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1316
1317         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1318                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1319                                       struct btrfs_extent_data_ref);
1320                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1321         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1322                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1323                                       struct btrfs_shared_data_ref);
1324                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1325         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1326                 btrfs_print_v0_err(trans->fs_info);
1327                 btrfs_abort_transaction(trans, -EINVAL);
1328                 return -EINVAL;
1329         } else {
1330                 BUG();
1331         }
1332
1333         BUG_ON(num_refs < refs_to_drop);
1334         num_refs -= refs_to_drop;
1335
1336         if (num_refs == 0) {
1337                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1338                 *last_ref = 1;
1339         } else {
1340                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1341                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1342                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1343                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1344                 btrfs_mark_buffer_dirty(leaf);
1345         }
1346         return ret;
1347 }
1348
1349 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1350                                           struct btrfs_extent_inline_ref *iref)
1351 {
1352         struct btrfs_key key;
1353         struct extent_buffer *leaf;
1354         struct btrfs_extent_data_ref *ref1;
1355         struct btrfs_shared_data_ref *ref2;
1356         u32 num_refs = 0;
1357         int type;
1358
1359         leaf = path->nodes[0];
1360         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1361
1362         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1363         if (iref) {
1364                 /*
1365                  * If type is invalid, we should have bailed out earlier than
1366                  * this call.
1367                  */
1368                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1369                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1370                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1371                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1372                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1373                 } else {
1374                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1375                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1376                 }
1377         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1378                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1379                                       struct btrfs_extent_data_ref);
1380                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1381         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1382                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1383                                       struct btrfs_shared_data_ref);
1384                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1385         } else {
1386                 WARN_ON(1);
1387         }
1388         return num_refs;
1389 }
1390
1391 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1392                                           struct btrfs_path *path,
1393                                           u64 bytenr, u64 parent,
1394                                           u64 root_objectid)
1395 {
1396         struct btrfs_root *root = trans->fs_info->extent_root;
1397         struct btrfs_key key;
1398         int ret;
1399
1400         key.objectid = bytenr;
1401         if (parent) {
1402                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1403                 key.offset = parent;
1404         } else {
1405                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1406                 key.offset = root_objectid;
1407         }
1408
1409         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1410         if (ret > 0)
1411                 ret = -ENOENT;
1412         return ret;
1413 }
1414
1415 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1416                                           struct btrfs_path *path,
1417                                           u64 bytenr, u64 parent,
1418                                           u64 root_objectid)
1419 {
1420         struct btrfs_key key;
1421         int ret;
1422
1423         key.objectid = bytenr;
1424         if (parent) {
1425                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1426                 key.offset = parent;
1427         } else {
1428                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1429                 key.offset = root_objectid;
1430         }
1431
1432         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1433                                       path, &key, 0);
1434         btrfs_release_path(path);
1435         return ret;
1436 }
1437
1438 static inline int extent_ref_type(u64 parent, u64 owner)
1439 {
1440         int type;
1441         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1442                 if (parent > 0)
1443                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1444                 else
1445                         type = BTRFS_TREE_BLOCK_REF_KEY;
1446         } else {
1447                 if (parent > 0)
1448                         type = BTRFS_SHARED_DATA_REF_KEY;
1449                 else
1450                         type = BTRFS_EXTENT_DATA_REF_KEY;
1451         }
1452         return type;
1453 }
1454
1455 static int find_next_key(struct btrfs_path *path, int level,
1456                          struct btrfs_key *key)
1457
1458 {
1459         for (; level < BTRFS_MAX_LEVEL; level++) {
1460                 if (!path->nodes[level])
1461                         break;
1462                 if (path->slots[level] + 1 >=
1463                     btrfs_header_nritems(path->nodes[level]))
1464                         continue;
1465                 if (level == 0)
1466                         btrfs_item_key_to_cpu(path->nodes[level], key,
1467                                               path->slots[level] + 1);
1468                 else
1469                         btrfs_node_key_to_cpu(path->nodes[level], key,
1470                                               path->slots[level] + 1);
1471                 return 0;
1472         }
1473         return 1;
1474 }
1475
1476 /*
1477  * look for inline back ref. if back ref is found, *ref_ret is set
1478  * to the address of inline back ref, and 0 is returned.
1479  *
1480  * if back ref isn't found, *ref_ret is set to the address where it
1481  * should be inserted, and -ENOENT is returned.
1482  *
1483  * if insert is true and there are too many inline back refs, the path
1484  * points to the extent item, and -EAGAIN is returned.
1485  *
1486  * NOTE: inline back refs are ordered in the same way that back ref
1487  *       items in the tree are ordered.
1488  */
1489 static noinline_for_stack
1490 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1491                                  struct btrfs_path *path,
1492                                  struct btrfs_extent_inline_ref **ref_ret,
1493                                  u64 bytenr, u64 num_bytes,
1494                                  u64 parent, u64 root_objectid,
1495                                  u64 owner, u64 offset, int insert)
1496 {
1497         struct btrfs_fs_info *fs_info = trans->fs_info;
1498         struct btrfs_root *root = fs_info->extent_root;
1499         struct btrfs_key key;
1500         struct extent_buffer *leaf;
1501         struct btrfs_extent_item *ei;
1502         struct btrfs_extent_inline_ref *iref;
1503         u64 flags;
1504         u64 item_size;
1505         unsigned long ptr;
1506         unsigned long end;
1507         int extra_size;
1508         int type;
1509         int want;
1510         int ret;
1511         int err = 0;
1512         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1513         int needed;
1514
1515         key.objectid = bytenr;
1516         key.type = BTRFS_EXTENT_ITEM_KEY;
1517         key.offset = num_bytes;
1518
1519         want = extent_ref_type(parent, owner);
1520         if (insert) {
1521                 extra_size = btrfs_extent_inline_ref_size(want);
1522                 path->keep_locks = 1;
1523         } else
1524                 extra_size = -1;
1525
1526         /*
1527          * Owner is our level, so we can just add one to get the level for the
1528          * block we are interested in.
1529          */
1530         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1531                 key.type = BTRFS_METADATA_ITEM_KEY;
1532                 key.offset = owner;
1533         }
1534
1535 again:
1536         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1537         if (ret < 0) {
1538                 err = ret;
1539                 goto out;
1540         }
1541
1542         /*
1543          * We may be a newly converted file system which still has the old fat
1544          * extent entries for metadata, so try and see if we have one of those.
1545          */
1546         if (ret > 0 && skinny_metadata) {
1547                 skinny_metadata = false;
1548                 if (path->slots[0]) {
1549                         path->slots[0]--;
1550                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1551                                               path->slots[0]);
1552                         if (key.objectid == bytenr &&
1553                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1554                             key.offset == num_bytes)
1555                                 ret = 0;
1556                 }
1557                 if (ret) {
1558                         key.objectid = bytenr;
1559                         key.type = BTRFS_EXTENT_ITEM_KEY;
1560                         key.offset = num_bytes;
1561                         btrfs_release_path(path);
1562                         goto again;
1563                 }
1564         }
1565
1566         if (ret && !insert) {
1567                 err = -ENOENT;
1568                 goto out;
1569         } else if (WARN_ON(ret)) {
1570                 err = -EIO;
1571                 goto out;
1572         }
1573
1574         leaf = path->nodes[0];
1575         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1576         if (unlikely(item_size < sizeof(*ei))) {
1577                 err = -EINVAL;
1578                 btrfs_print_v0_err(fs_info);
1579                 btrfs_abort_transaction(trans, err);
1580                 goto out;
1581         }
1582
1583         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1584         flags = btrfs_extent_flags(leaf, ei);
1585
1586         ptr = (unsigned long)(ei + 1);
1587         end = (unsigned long)ei + item_size;
1588
1589         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1590                 ptr += sizeof(struct btrfs_tree_block_info);
1591                 BUG_ON(ptr > end);
1592         }
1593
1594         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1595                 needed = BTRFS_REF_TYPE_DATA;
1596         else
1597                 needed = BTRFS_REF_TYPE_BLOCK;
1598
1599         err = -ENOENT;
1600         while (1) {
1601                 if (ptr >= end) {
1602                         WARN_ON(ptr > end);
1603                         break;
1604                 }
1605                 iref = (struct btrfs_extent_inline_ref *)ptr;
1606                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1607                 if (type == BTRFS_REF_TYPE_INVALID) {
1608                         err = -EUCLEAN;
1609                         goto out;
1610                 }
1611
1612                 if (want < type)
1613                         break;
1614                 if (want > type) {
1615                         ptr += btrfs_extent_inline_ref_size(type);
1616                         continue;
1617                 }
1618
1619                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1620                         struct btrfs_extent_data_ref *dref;
1621                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1622                         if (match_extent_data_ref(leaf, dref, root_objectid,
1623                                                   owner, offset)) {
1624                                 err = 0;
1625                                 break;
1626                         }
1627                         if (hash_extent_data_ref_item(leaf, dref) <
1628                             hash_extent_data_ref(root_objectid, owner, offset))
1629                                 break;
1630                 } else {
1631                         u64 ref_offset;
1632                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1633                         if (parent > 0) {
1634                                 if (parent == ref_offset) {
1635                                         err = 0;
1636                                         break;
1637                                 }
1638                                 if (ref_offset < parent)
1639                                         break;
1640                         } else {
1641                                 if (root_objectid == ref_offset) {
1642                                         err = 0;
1643                                         break;
1644                                 }
1645                                 if (ref_offset < root_objectid)
1646                                         break;
1647                         }
1648                 }
1649                 ptr += btrfs_extent_inline_ref_size(type);
1650         }
1651         if (err == -ENOENT && insert) {
1652                 if (item_size + extra_size >=
1653                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1654                         err = -EAGAIN;
1655                         goto out;
1656                 }
1657                 /*
1658                  * To add new inline back ref, we have to make sure
1659                  * there is no corresponding back ref item.
1660                  * For simplicity, we just do not add new inline back
1661                  * ref if there is any kind of item for this block
1662                  */
1663                 if (find_next_key(path, 0, &key) == 0 &&
1664                     key.objectid == bytenr &&
1665                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1666                         err = -EAGAIN;
1667                         goto out;
1668                 }
1669         }
1670         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1671 out:
1672         if (insert) {
1673                 path->keep_locks = 0;
1674                 btrfs_unlock_up_safe(path, 1);
1675         }
1676         return err;
1677 }
1678
1679 /*
1680  * helper to add new inline back ref
1681  */
1682 static noinline_for_stack
1683 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1684                                  struct btrfs_path *path,
1685                                  struct btrfs_extent_inline_ref *iref,
1686                                  u64 parent, u64 root_objectid,
1687                                  u64 owner, u64 offset, int refs_to_add,
1688                                  struct btrfs_delayed_extent_op *extent_op)
1689 {
1690         struct extent_buffer *leaf;
1691         struct btrfs_extent_item *ei;
1692         unsigned long ptr;
1693         unsigned long end;
1694         unsigned long item_offset;
1695         u64 refs;
1696         int size;
1697         int type;
1698
1699         leaf = path->nodes[0];
1700         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1701         item_offset = (unsigned long)iref - (unsigned long)ei;
1702
1703         type = extent_ref_type(parent, owner);
1704         size = btrfs_extent_inline_ref_size(type);
1705
1706         btrfs_extend_item(fs_info, path, size);
1707
1708         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1709         refs = btrfs_extent_refs(leaf, ei);
1710         refs += refs_to_add;
1711         btrfs_set_extent_refs(leaf, ei, refs);
1712         if (extent_op)
1713                 __run_delayed_extent_op(extent_op, leaf, ei);
1714
1715         ptr = (unsigned long)ei + item_offset;
1716         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1717         if (ptr < end - size)
1718                 memmove_extent_buffer(leaf, ptr + size, ptr,
1719                                       end - size - ptr);
1720
1721         iref = (struct btrfs_extent_inline_ref *)ptr;
1722         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1723         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1724                 struct btrfs_extent_data_ref *dref;
1725                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1726                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1727                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1728                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1729                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1730         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1731                 struct btrfs_shared_data_ref *sref;
1732                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1733                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1734                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1735         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1736                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1737         } else {
1738                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1739         }
1740         btrfs_mark_buffer_dirty(leaf);
1741 }
1742
1743 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1744                                  struct btrfs_path *path,
1745                                  struct btrfs_extent_inline_ref **ref_ret,
1746                                  u64 bytenr, u64 num_bytes, u64 parent,
1747                                  u64 root_objectid, u64 owner, u64 offset)
1748 {
1749         int ret;
1750
1751         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1752                                            num_bytes, parent, root_objectid,
1753                                            owner, offset, 0);
1754         if (ret != -ENOENT)
1755                 return ret;
1756
1757         btrfs_release_path(path);
1758         *ref_ret = NULL;
1759
1760         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1761                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1762                                             root_objectid);
1763         } else {
1764                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1765                                              root_objectid, owner, offset);
1766         }
1767         return ret;
1768 }
1769
1770 /*
1771  * helper to update/remove inline back ref
1772  */
1773 static noinline_for_stack
1774 void update_inline_extent_backref(struct btrfs_path *path,
1775                                   struct btrfs_extent_inline_ref *iref,
1776                                   int refs_to_mod,
1777                                   struct btrfs_delayed_extent_op *extent_op,
1778                                   int *last_ref)
1779 {
1780         struct extent_buffer *leaf = path->nodes[0];
1781         struct btrfs_fs_info *fs_info = leaf->fs_info;
1782         struct btrfs_extent_item *ei;
1783         struct btrfs_extent_data_ref *dref = NULL;
1784         struct btrfs_shared_data_ref *sref = NULL;
1785         unsigned long ptr;
1786         unsigned long end;
1787         u32 item_size;
1788         int size;
1789         int type;
1790         u64 refs;
1791
1792         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1793         refs = btrfs_extent_refs(leaf, ei);
1794         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1795         refs += refs_to_mod;
1796         btrfs_set_extent_refs(leaf, ei, refs);
1797         if (extent_op)
1798                 __run_delayed_extent_op(extent_op, leaf, ei);
1799
1800         /*
1801          * If type is invalid, we should have bailed out after
1802          * lookup_inline_extent_backref().
1803          */
1804         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1805         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1806
1807         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1808                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1809                 refs = btrfs_extent_data_ref_count(leaf, dref);
1810         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1811                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1812                 refs = btrfs_shared_data_ref_count(leaf, sref);
1813         } else {
1814                 refs = 1;
1815                 BUG_ON(refs_to_mod != -1);
1816         }
1817
1818         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1819         refs += refs_to_mod;
1820
1821         if (refs > 0) {
1822                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1823                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1824                 else
1825                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1826         } else {
1827                 *last_ref = 1;
1828                 size =  btrfs_extent_inline_ref_size(type);
1829                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1830                 ptr = (unsigned long)iref;
1831                 end = (unsigned long)ei + item_size;
1832                 if (ptr + size < end)
1833                         memmove_extent_buffer(leaf, ptr, ptr + size,
1834                                               end - ptr - size);
1835                 item_size -= size;
1836                 btrfs_truncate_item(fs_info, path, item_size, 1);
1837         }
1838         btrfs_mark_buffer_dirty(leaf);
1839 }
1840
1841 static noinline_for_stack
1842 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1843                                  struct btrfs_path *path,
1844                                  u64 bytenr, u64 num_bytes, u64 parent,
1845                                  u64 root_objectid, u64 owner,
1846                                  u64 offset, int refs_to_add,
1847                                  struct btrfs_delayed_extent_op *extent_op)
1848 {
1849         struct btrfs_extent_inline_ref *iref;
1850         int ret;
1851
1852         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1853                                            num_bytes, parent, root_objectid,
1854                                            owner, offset, 1);
1855         if (ret == 0) {
1856                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1857                 update_inline_extent_backref(path, iref, refs_to_add,
1858                                              extent_op, NULL);
1859         } else if (ret == -ENOENT) {
1860                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1861                                             root_objectid, owner, offset,
1862                                             refs_to_add, extent_op);
1863                 ret = 0;
1864         }
1865         return ret;
1866 }
1867
1868 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1869                                  struct btrfs_path *path,
1870                                  u64 bytenr, u64 parent, u64 root_objectid,
1871                                  u64 owner, u64 offset, int refs_to_add)
1872 {
1873         int ret;
1874         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1875                 BUG_ON(refs_to_add != 1);
1876                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1877                                             root_objectid);
1878         } else {
1879                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1880                                              root_objectid, owner, offset,
1881                                              refs_to_add);
1882         }
1883         return ret;
1884 }
1885
1886 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1887                                  struct btrfs_path *path,
1888                                  struct btrfs_extent_inline_ref *iref,
1889                                  int refs_to_drop, int is_data, int *last_ref)
1890 {
1891         int ret = 0;
1892
1893         BUG_ON(!is_data && refs_to_drop != 1);
1894         if (iref) {
1895                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1896                                              last_ref);
1897         } else if (is_data) {
1898                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1899                                              last_ref);
1900         } else {
1901                 *last_ref = 1;
1902                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1903         }
1904         return ret;
1905 }
1906
1907 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1908 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1909                                u64 *discarded_bytes)
1910 {
1911         int j, ret = 0;
1912         u64 bytes_left, end;
1913         u64 aligned_start = ALIGN(start, 1 << 9);
1914
1915         if (WARN_ON(start != aligned_start)) {
1916                 len -= aligned_start - start;
1917                 len = round_down(len, 1 << 9);
1918                 start = aligned_start;
1919         }
1920
1921         *discarded_bytes = 0;
1922
1923         if (!len)
1924                 return 0;
1925
1926         end = start + len;
1927         bytes_left = len;
1928
1929         /* Skip any superblocks on this device. */
1930         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1931                 u64 sb_start = btrfs_sb_offset(j);
1932                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1933                 u64 size = sb_start - start;
1934
1935                 if (!in_range(sb_start, start, bytes_left) &&
1936                     !in_range(sb_end, start, bytes_left) &&
1937                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1938                         continue;
1939
1940                 /*
1941                  * Superblock spans beginning of range.  Adjust start and
1942                  * try again.
1943                  */
1944                 if (sb_start <= start) {
1945                         start += sb_end - start;
1946                         if (start > end) {
1947                                 bytes_left = 0;
1948                                 break;
1949                         }
1950                         bytes_left = end - start;
1951                         continue;
1952                 }
1953
1954                 if (size) {
1955                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1956                                                    GFP_NOFS, 0);
1957                         if (!ret)
1958                                 *discarded_bytes += size;
1959                         else if (ret != -EOPNOTSUPP)
1960                                 return ret;
1961                 }
1962
1963                 start = sb_end;
1964                 if (start > end) {
1965                         bytes_left = 0;
1966                         break;
1967                 }
1968                 bytes_left = end - start;
1969         }
1970
1971         if (bytes_left) {
1972                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1973                                            GFP_NOFS, 0);
1974                 if (!ret)
1975                         *discarded_bytes += bytes_left;
1976         }
1977         return ret;
1978 }
1979
1980 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1981                          u64 num_bytes, u64 *actual_bytes)
1982 {
1983         int ret;
1984         u64 discarded_bytes = 0;
1985         struct btrfs_bio *bbio = NULL;
1986
1987
1988         /*
1989          * Avoid races with device replace and make sure our bbio has devices
1990          * associated to its stripes that don't go away while we are discarding.
1991          */
1992         btrfs_bio_counter_inc_blocked(fs_info);
1993         /* Tell the block device(s) that the sectors can be discarded */
1994         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1995                               &bbio, 0);
1996         /* Error condition is -ENOMEM */
1997         if (!ret) {
1998                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1999                 int i;
2000
2001
2002                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2003                         u64 bytes;
2004                         struct request_queue *req_q;
2005
2006                         if (!stripe->dev->bdev) {
2007                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2008                                 continue;
2009                         }
2010                         req_q = bdev_get_queue(stripe->dev->bdev);
2011                         if (!blk_queue_discard(req_q))
2012                                 continue;
2013
2014                         ret = btrfs_issue_discard(stripe->dev->bdev,
2015                                                   stripe->physical,
2016                                                   stripe->length,
2017                                                   &bytes);
2018                         if (!ret)
2019                                 discarded_bytes += bytes;
2020                         else if (ret != -EOPNOTSUPP)
2021                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2022
2023                         /*
2024                          * Just in case we get back EOPNOTSUPP for some reason,
2025                          * just ignore the return value so we don't screw up
2026                          * people calling discard_extent.
2027                          */
2028                         ret = 0;
2029                 }
2030                 btrfs_put_bbio(bbio);
2031         }
2032         btrfs_bio_counter_dec(fs_info);
2033
2034         if (actual_bytes)
2035                 *actual_bytes = discarded_bytes;
2036
2037
2038         if (ret == -EOPNOTSUPP)
2039                 ret = 0;
2040         return ret;
2041 }
2042
2043 /* Can return -ENOMEM */
2044 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2045                          struct btrfs_root *root,
2046                          u64 bytenr, u64 num_bytes, u64 parent,
2047                          u64 root_objectid, u64 owner, u64 offset)
2048 {
2049         struct btrfs_fs_info *fs_info = root->fs_info;
2050         int old_ref_mod, new_ref_mod;
2051         int ret;
2052
2053         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2054                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2055
2056         btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2057                            owner, offset, BTRFS_ADD_DELAYED_REF);
2058
2059         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2060                 ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2061                                                  num_bytes, parent,
2062                                                  root_objectid, (int)owner,
2063                                                  BTRFS_ADD_DELAYED_REF, NULL,
2064                                                  &old_ref_mod, &new_ref_mod);
2065         } else {
2066                 ret = btrfs_add_delayed_data_ref(trans, bytenr,
2067                                                  num_bytes, parent,
2068                                                  root_objectid, owner, offset,
2069                                                  0, BTRFS_ADD_DELAYED_REF,
2070                                                  &old_ref_mod, &new_ref_mod);
2071         }
2072
2073         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2074                 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2075
2076                 add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2077         }
2078
2079         return ret;
2080 }
2081
2082 /*
2083  * __btrfs_inc_extent_ref - insert backreference for a given extent
2084  *
2085  * @trans:          Handle of transaction
2086  *
2087  * @node:           The delayed ref node used to get the bytenr/length for
2088  *                  extent whose references are incremented.
2089  *
2090  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2091  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2092  *                  bytenr of the parent block. Since new extents are always
2093  *                  created with indirect references, this will only be the case
2094  *                  when relocating a shared extent. In that case, root_objectid
2095  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
2096  *                  be 0
2097  *
2098  * @root_objectid:  The id of the root where this modification has originated,
2099  *                  this can be either one of the well-known metadata trees or
2100  *                  the subvolume id which references this extent.
2101  *
2102  * @owner:          For data extents it is the inode number of the owning file.
2103  *                  For metadata extents this parameter holds the level in the
2104  *                  tree of the extent.
2105  *
2106  * @offset:         For metadata extents the offset is ignored and is currently
2107  *                  always passed as 0. For data extents it is the fileoffset
2108  *                  this extent belongs to.
2109  *
2110  * @refs_to_add     Number of references to add
2111  *
2112  * @extent_op       Pointer to a structure, holding information necessary when
2113  *                  updating a tree block's flags
2114  *
2115  */
2116 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2117                                   struct btrfs_delayed_ref_node *node,
2118                                   u64 parent, u64 root_objectid,
2119                                   u64 owner, u64 offset, int refs_to_add,
2120                                   struct btrfs_delayed_extent_op *extent_op)
2121 {
2122         struct btrfs_path *path;
2123         struct extent_buffer *leaf;
2124         struct btrfs_extent_item *item;
2125         struct btrfs_key key;
2126         u64 bytenr = node->bytenr;
2127         u64 num_bytes = node->num_bytes;
2128         u64 refs;
2129         int ret;
2130
2131         path = btrfs_alloc_path();
2132         if (!path)
2133                 return -ENOMEM;
2134
2135         path->reada = READA_FORWARD;
2136         path->leave_spinning = 1;
2137         /* this will setup the path even if it fails to insert the back ref */
2138         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2139                                            parent, root_objectid, owner,
2140                                            offset, refs_to_add, extent_op);
2141         if ((ret < 0 && ret != -EAGAIN) || !ret)
2142                 goto out;
2143
2144         /*
2145          * Ok we had -EAGAIN which means we didn't have space to insert and
2146          * inline extent ref, so just update the reference count and add a
2147          * normal backref.
2148          */
2149         leaf = path->nodes[0];
2150         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2151         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2152         refs = btrfs_extent_refs(leaf, item);
2153         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2154         if (extent_op)
2155                 __run_delayed_extent_op(extent_op, leaf, item);
2156
2157         btrfs_mark_buffer_dirty(leaf);
2158         btrfs_release_path(path);
2159
2160         path->reada = READA_FORWARD;
2161         path->leave_spinning = 1;
2162         /* now insert the actual backref */
2163         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2164                                     owner, offset, refs_to_add);
2165         if (ret)
2166                 btrfs_abort_transaction(trans, ret);
2167 out:
2168         btrfs_free_path(path);
2169         return ret;
2170 }
2171
2172 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2173                                 struct btrfs_delayed_ref_node *node,
2174                                 struct btrfs_delayed_extent_op *extent_op,
2175                                 int insert_reserved)
2176 {
2177         int ret = 0;
2178         struct btrfs_delayed_data_ref *ref;
2179         struct btrfs_key ins;
2180         u64 parent = 0;
2181         u64 ref_root = 0;
2182         u64 flags = 0;
2183
2184         ins.objectid = node->bytenr;
2185         ins.offset = node->num_bytes;
2186         ins.type = BTRFS_EXTENT_ITEM_KEY;
2187
2188         ref = btrfs_delayed_node_to_data_ref(node);
2189         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2190
2191         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2192                 parent = ref->parent;
2193         ref_root = ref->root;
2194
2195         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2196                 if (extent_op)
2197                         flags |= extent_op->flags_to_set;
2198                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2199                                                  flags, ref->objectid,
2200                                                  ref->offset, &ins,
2201                                                  node->ref_mod);
2202         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2203                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2204                                              ref->objectid, ref->offset,
2205                                              node->ref_mod, extent_op);
2206         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2207                 ret = __btrfs_free_extent(trans, node, parent,
2208                                           ref_root, ref->objectid,
2209                                           ref->offset, node->ref_mod,
2210                                           extent_op);
2211         } else {
2212                 BUG();
2213         }
2214         return ret;
2215 }
2216
2217 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2218                                     struct extent_buffer *leaf,
2219                                     struct btrfs_extent_item *ei)
2220 {
2221         u64 flags = btrfs_extent_flags(leaf, ei);
2222         if (extent_op->update_flags) {
2223                 flags |= extent_op->flags_to_set;
2224                 btrfs_set_extent_flags(leaf, ei, flags);
2225         }
2226
2227         if (extent_op->update_key) {
2228                 struct btrfs_tree_block_info *bi;
2229                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2230                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2231                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2232         }
2233 }
2234
2235 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2236                                  struct btrfs_delayed_ref_head *head,
2237                                  struct btrfs_delayed_extent_op *extent_op)
2238 {
2239         struct btrfs_fs_info *fs_info = trans->fs_info;
2240         struct btrfs_key key;
2241         struct btrfs_path *path;
2242         struct btrfs_extent_item *ei;
2243         struct extent_buffer *leaf;
2244         u32 item_size;
2245         int ret;
2246         int err = 0;
2247         int metadata = !extent_op->is_data;
2248
2249         if (trans->aborted)
2250                 return 0;
2251
2252         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2253                 metadata = 0;
2254
2255         path = btrfs_alloc_path();
2256         if (!path)
2257                 return -ENOMEM;
2258
2259         key.objectid = head->bytenr;
2260
2261         if (metadata) {
2262                 key.type = BTRFS_METADATA_ITEM_KEY;
2263                 key.offset = extent_op->level;
2264         } else {
2265                 key.type = BTRFS_EXTENT_ITEM_KEY;
2266                 key.offset = head->num_bytes;
2267         }
2268
2269 again:
2270         path->reada = READA_FORWARD;
2271         path->leave_spinning = 1;
2272         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2273         if (ret < 0) {
2274                 err = ret;
2275                 goto out;
2276         }
2277         if (ret > 0) {
2278                 if (metadata) {
2279                         if (path->slots[0] > 0) {
2280                                 path->slots[0]--;
2281                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2282                                                       path->slots[0]);
2283                                 if (key.objectid == head->bytenr &&
2284                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2285                                     key.offset == head->num_bytes)
2286                                         ret = 0;
2287                         }
2288                         if (ret > 0) {
2289                                 btrfs_release_path(path);
2290                                 metadata = 0;
2291
2292                                 key.objectid = head->bytenr;
2293                                 key.offset = head->num_bytes;
2294                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2295                                 goto again;
2296                         }
2297                 } else {
2298                         err = -EIO;
2299                         goto out;
2300                 }
2301         }
2302
2303         leaf = path->nodes[0];
2304         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2305
2306         if (unlikely(item_size < sizeof(*ei))) {
2307                 err = -EINVAL;
2308                 btrfs_print_v0_err(fs_info);
2309                 btrfs_abort_transaction(trans, err);
2310                 goto out;
2311         }
2312
2313         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2314         __run_delayed_extent_op(extent_op, leaf, ei);
2315
2316         btrfs_mark_buffer_dirty(leaf);
2317 out:
2318         btrfs_free_path(path);
2319         return err;
2320 }
2321
2322 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2323                                 struct btrfs_delayed_ref_node *node,
2324                                 struct btrfs_delayed_extent_op *extent_op,
2325                                 int insert_reserved)
2326 {
2327         int ret = 0;
2328         struct btrfs_delayed_tree_ref *ref;
2329         u64 parent = 0;
2330         u64 ref_root = 0;
2331
2332         ref = btrfs_delayed_node_to_tree_ref(node);
2333         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2334
2335         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2336                 parent = ref->parent;
2337         ref_root = ref->root;
2338
2339         if (node->ref_mod != 1) {
2340                 btrfs_err(trans->fs_info,
2341         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2342                           node->bytenr, node->ref_mod, node->action, ref_root,
2343                           parent);
2344                 return -EIO;
2345         }
2346         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2347                 BUG_ON(!extent_op || !extent_op->update_flags);
2348                 ret = alloc_reserved_tree_block(trans, node, extent_op);
2349         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2350                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2351                                              ref->level, 0, 1, extent_op);
2352         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2353                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
2354                                           ref->level, 0, 1, extent_op);
2355         } else {
2356                 BUG();
2357         }
2358         return ret;
2359 }
2360
2361 /* helper function to actually process a single delayed ref entry */
2362 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2363                                struct btrfs_delayed_ref_node *node,
2364                                struct btrfs_delayed_extent_op *extent_op,
2365                                int insert_reserved)
2366 {
2367         int ret = 0;
2368
2369         if (trans->aborted) {
2370                 if (insert_reserved)
2371                         btrfs_pin_extent(trans->fs_info, node->bytenr,
2372                                          node->num_bytes, 1);
2373                 return 0;
2374         }
2375
2376         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2377             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2378                 ret = run_delayed_tree_ref(trans, node, extent_op,
2379                                            insert_reserved);
2380         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2381                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2382                 ret = run_delayed_data_ref(trans, node, extent_op,
2383                                            insert_reserved);
2384         else
2385                 BUG();
2386         if (ret && insert_reserved)
2387                 btrfs_pin_extent(trans->fs_info, node->bytenr,
2388                                  node->num_bytes, 1);
2389         return ret;
2390 }
2391
2392 static inline struct btrfs_delayed_ref_node *
2393 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2394 {
2395         struct btrfs_delayed_ref_node *ref;
2396
2397         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2398                 return NULL;
2399
2400         /*
2401          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2402          * This is to prevent a ref count from going down to zero, which deletes
2403          * the extent item from the extent tree, when there still are references
2404          * to add, which would fail because they would not find the extent item.
2405          */
2406         if (!list_empty(&head->ref_add_list))
2407                 return list_first_entry(&head->ref_add_list,
2408                                 struct btrfs_delayed_ref_node, add_list);
2409
2410         ref = rb_entry(rb_first_cached(&head->ref_tree),
2411                        struct btrfs_delayed_ref_node, ref_node);
2412         ASSERT(list_empty(&ref->add_list));
2413         return ref;
2414 }
2415
2416 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2417                                       struct btrfs_delayed_ref_head *head)
2418 {
2419         spin_lock(&delayed_refs->lock);
2420         head->processing = 0;
2421         delayed_refs->num_heads_ready++;
2422         spin_unlock(&delayed_refs->lock);
2423         btrfs_delayed_ref_unlock(head);
2424 }
2425
2426 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2427                              struct btrfs_delayed_ref_head *head)
2428 {
2429         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2430         int ret;
2431
2432         if (!extent_op)
2433                 return 0;
2434         head->extent_op = NULL;
2435         if (head->must_insert_reserved) {
2436                 btrfs_free_delayed_extent_op(extent_op);
2437                 return 0;
2438         }
2439         spin_unlock(&head->lock);
2440         ret = run_delayed_extent_op(trans, head, extent_op);
2441         btrfs_free_delayed_extent_op(extent_op);
2442         return ret ? ret : 1;
2443 }
2444
2445 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2446                             struct btrfs_delayed_ref_head *head)
2447 {
2448
2449         struct btrfs_fs_info *fs_info = trans->fs_info;
2450         struct btrfs_delayed_ref_root *delayed_refs;
2451         int ret;
2452
2453         delayed_refs = &trans->transaction->delayed_refs;
2454
2455         ret = cleanup_extent_op(trans, head);
2456         if (ret < 0) {
2457                 unselect_delayed_ref_head(delayed_refs, head);
2458                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2459                 return ret;
2460         } else if (ret) {
2461                 return ret;
2462         }
2463
2464         /*
2465          * Need to drop our head ref lock and re-acquire the delayed ref lock
2466          * and then re-check to make sure nobody got added.
2467          */
2468         spin_unlock(&head->lock);
2469         spin_lock(&delayed_refs->lock);
2470         spin_lock(&head->lock);
2471         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2472                 spin_unlock(&head->lock);
2473                 spin_unlock(&delayed_refs->lock);
2474                 return 1;
2475         }
2476         delayed_refs->num_heads--;
2477         rb_erase_cached(&head->href_node, &delayed_refs->href_root);
2478         RB_CLEAR_NODE(&head->href_node);
2479         spin_unlock(&head->lock);
2480         spin_unlock(&delayed_refs->lock);
2481         atomic_dec(&delayed_refs->num_entries);
2482
2483         trace_run_delayed_ref_head(fs_info, head, 0);
2484
2485         if (head->total_ref_mod < 0) {
2486                 struct btrfs_space_info *space_info;
2487                 u64 flags;
2488
2489                 if (head->is_data)
2490                         flags = BTRFS_BLOCK_GROUP_DATA;
2491                 else if (head->is_system)
2492                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
2493                 else
2494                         flags = BTRFS_BLOCK_GROUP_METADATA;
2495                 space_info = __find_space_info(fs_info, flags);
2496                 ASSERT(space_info);
2497                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2498                                    -head->num_bytes,
2499                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
2500
2501                 if (head->is_data) {
2502                         spin_lock(&delayed_refs->lock);
2503                         delayed_refs->pending_csums -= head->num_bytes;
2504                         spin_unlock(&delayed_refs->lock);
2505                 }
2506         }
2507
2508         if (head->must_insert_reserved) {
2509                 btrfs_pin_extent(fs_info, head->bytenr,
2510                                  head->num_bytes, 1);
2511                 if (head->is_data) {
2512                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2513                                               head->num_bytes);
2514                 }
2515         }
2516
2517         /* Also free its reserved qgroup space */
2518         btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2519                                       head->qgroup_reserved);
2520         btrfs_delayed_ref_unlock(head);
2521         btrfs_put_delayed_ref_head(head);
2522         return 0;
2523 }
2524
2525 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2526                                         struct btrfs_trans_handle *trans)
2527 {
2528         struct btrfs_delayed_ref_root *delayed_refs =
2529                 &trans->transaction->delayed_refs;
2530         struct btrfs_delayed_ref_head *head = NULL;
2531         int ret;
2532
2533         spin_lock(&delayed_refs->lock);
2534         head = btrfs_select_ref_head(delayed_refs);
2535         if (!head) {
2536                 spin_unlock(&delayed_refs->lock);
2537                 return head;
2538         }
2539
2540         /*
2541          * Grab the lock that says we are going to process all the refs for
2542          * this head
2543          */
2544         ret = btrfs_delayed_ref_lock(delayed_refs, head);
2545         spin_unlock(&delayed_refs->lock);
2546
2547         /*
2548          * We may have dropped the spin lock to get the head mutex lock, and
2549          * that might have given someone else time to free the head.  If that's
2550          * true, it has been removed from our list and we can move on.
2551          */
2552         if (ret == -EAGAIN)
2553                 head = ERR_PTR(-EAGAIN);
2554
2555         return head;
2556 }
2557
2558 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2559                                     struct btrfs_delayed_ref_head *locked_ref,
2560                                     unsigned long *run_refs)
2561 {
2562         struct btrfs_fs_info *fs_info = trans->fs_info;
2563         struct btrfs_delayed_ref_root *delayed_refs;
2564         struct btrfs_delayed_extent_op *extent_op;
2565         struct btrfs_delayed_ref_node *ref;
2566         int must_insert_reserved = 0;
2567         int ret;
2568
2569         delayed_refs = &trans->transaction->delayed_refs;
2570
2571         lockdep_assert_held(&locked_ref->mutex);
2572         lockdep_assert_held(&locked_ref->lock);
2573
2574         while ((ref = select_delayed_ref(locked_ref))) {
2575                 if (ref->seq &&
2576                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
2577                         spin_unlock(&locked_ref->lock);
2578                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2579                         return -EAGAIN;
2580                 }
2581
2582                 (*run_refs)++;
2583                 ref->in_tree = 0;
2584                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2585                 RB_CLEAR_NODE(&ref->ref_node);
2586                 if (!list_empty(&ref->add_list))
2587                         list_del(&ref->add_list);
2588                 /*
2589                  * When we play the delayed ref, also correct the ref_mod on
2590                  * head
2591                  */
2592                 switch (ref->action) {
2593                 case BTRFS_ADD_DELAYED_REF:
2594                 case BTRFS_ADD_DELAYED_EXTENT:
2595                         locked_ref->ref_mod -= ref->ref_mod;
2596                         break;
2597                 case BTRFS_DROP_DELAYED_REF:
2598                         locked_ref->ref_mod += ref->ref_mod;
2599                         break;
2600                 default:
2601                         WARN_ON(1);
2602                 }
2603                 atomic_dec(&delayed_refs->num_entries);
2604
2605                 /*
2606                  * Record the must_insert_reserved flag before we drop the
2607                  * spin lock.
2608                  */
2609                 must_insert_reserved = locked_ref->must_insert_reserved;
2610                 locked_ref->must_insert_reserved = 0;
2611
2612                 extent_op = locked_ref->extent_op;
2613                 locked_ref->extent_op = NULL;
2614                 spin_unlock(&locked_ref->lock);
2615
2616                 ret = run_one_delayed_ref(trans, ref, extent_op,
2617                                           must_insert_reserved);
2618
2619                 btrfs_free_delayed_extent_op(extent_op);
2620                 if (ret) {
2621                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2622                         btrfs_put_delayed_ref(ref);
2623                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2624                                     ret);
2625                         return ret;
2626                 }
2627
2628                 btrfs_put_delayed_ref(ref);
2629                 cond_resched();
2630
2631                 spin_lock(&locked_ref->lock);
2632                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2633         }
2634
2635         return 0;
2636 }
2637
2638 /*
2639  * Returns 0 on success or if called with an already aborted transaction.
2640  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2641  */
2642 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2643                                              unsigned long nr)
2644 {
2645         struct btrfs_fs_info *fs_info = trans->fs_info;
2646         struct btrfs_delayed_ref_root *delayed_refs;
2647         struct btrfs_delayed_ref_head *locked_ref = NULL;
2648         ktime_t start = ktime_get();
2649         int ret;
2650         unsigned long count = 0;
2651         unsigned long actual_count = 0;
2652
2653         delayed_refs = &trans->transaction->delayed_refs;
2654         do {
2655                 if (!locked_ref) {
2656                         locked_ref = btrfs_obtain_ref_head(trans);
2657                         if (IS_ERR_OR_NULL(locked_ref)) {
2658                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2659                                         continue;
2660                                 } else {
2661                                         break;
2662                                 }
2663                         }
2664                         count++;
2665                 }
2666                 /*
2667                  * We need to try and merge add/drops of the same ref since we
2668                  * can run into issues with relocate dropping the implicit ref
2669                  * and then it being added back again before the drop can
2670                  * finish.  If we merged anything we need to re-loop so we can
2671                  * get a good ref.
2672                  * Or we can get node references of the same type that weren't
2673                  * merged when created due to bumps in the tree mod seq, and
2674                  * we need to merge them to prevent adding an inline extent
2675                  * backref before dropping it (triggering a BUG_ON at
2676                  * insert_inline_extent_backref()).
2677                  */
2678                 spin_lock(&locked_ref->lock);
2679                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2680
2681                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2682                                                       &actual_count);
2683                 if (ret < 0 && ret != -EAGAIN) {
2684                         /*
2685                          * Error, btrfs_run_delayed_refs_for_head already
2686                          * unlocked everything so just bail out
2687                          */
2688                         return ret;
2689                 } else if (!ret) {
2690                         /*
2691                          * Success, perform the usual cleanup of a processed
2692                          * head
2693                          */
2694                         ret = cleanup_ref_head(trans, locked_ref);
2695                         if (ret > 0 ) {
2696                                 /* We dropped our lock, we need to loop. */
2697                                 ret = 0;
2698                                 continue;
2699                         } else if (ret) {
2700                                 return ret;
2701                         }
2702                 }
2703
2704                 /*
2705                  * Either success case or btrfs_run_delayed_refs_for_head
2706                  * returned -EAGAIN, meaning we need to select another head
2707                  */
2708
2709                 locked_ref = NULL;
2710                 cond_resched();
2711         } while ((nr != -1 && count < nr) || locked_ref);
2712
2713         /*
2714          * We don't want to include ref heads since we can have empty ref heads
2715          * and those will drastically skew our runtime down since we just do
2716          * accounting, no actual extent tree updates.
2717          */
2718         if (actual_count > 0) {
2719                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2720                 u64 avg;
2721
2722                 /*
2723                  * We weigh the current average higher than our current runtime
2724                  * to avoid large swings in the average.
2725                  */
2726                 spin_lock(&delayed_refs->lock);
2727                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2728                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2729                 spin_unlock(&delayed_refs->lock);
2730         }
2731         return 0;
2732 }
2733
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Walks down from the root of @root, alternating between the left and the
 * right child, and returns the bytenr of the last node visited.  Returns 0
 * for an empty tree.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	/* Defined result for an empty tree instead of an uninitialized read. */
	u64 middle = 0;
	int alt = 1;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2776
2777 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2778 {
2779         u64 num_bytes;
2780
2781         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2782                              sizeof(struct btrfs_extent_inline_ref));
2783         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2784                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2785
2786         /*
2787          * We don't ever fill up leaves all the way so multiply by 2 just to be
2788          * closer to what we're really going to want to use.
2789          */
2790         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2791 }
2792
2793 /*
2794  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2795  * would require to store the csums for that many bytes.
2796  */
2797 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2798 {
2799         u64 csum_size;
2800         u64 num_csums_per_leaf;
2801         u64 num_csums;
2802
2803         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2804         num_csums_per_leaf = div64_u64(csum_size,
2805                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2806         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2807         num_csums += num_csums_per_leaf - 1;
2808         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2809         return num_csums;
2810 }
2811
2812 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
2813 {
2814         struct btrfs_fs_info *fs_info = trans->fs_info;
2815         struct btrfs_block_rsv *global_rsv;
2816         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2817         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2818         unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2819         u64 num_bytes, num_dirty_bgs_bytes;
2820         int ret = 0;
2821
2822         num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2823         num_heads = heads_to_leaves(fs_info, num_heads);
2824         if (num_heads > 1)
2825                 num_bytes += (num_heads - 1) * fs_info->nodesize;
2826         num_bytes <<= 1;
2827         num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2828                                                         fs_info->nodesize;
2829         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2830                                                              num_dirty_bgs);
2831         global_rsv = &fs_info->global_block_rsv;
2832
2833         /*
2834          * If we can't allocate any more chunks lets make sure we have _lots_ of
2835          * wiggle room since running delayed refs can create more delayed refs.
2836          */
2837         if (global_rsv->space_info->full) {
2838                 num_dirty_bgs_bytes <<= 1;
2839                 num_bytes <<= 1;
2840         }
2841
2842         spin_lock(&global_rsv->lock);
2843         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2844                 ret = 1;
2845         spin_unlock(&global_rsv->lock);
2846         return ret;
2847 }
2848
2849 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2850 {
2851         u64 num_entries =
2852                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2853         u64 avg_runtime;
2854         u64 val;
2855
2856         smp_mb();
2857         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2858         val = num_entries * avg_runtime;
2859         if (val >= NSEC_PER_SEC)
2860                 return 1;
2861         if (val >= NSEC_PER_SEC / 2)
2862                 return 2;
2863
2864         return btrfs_check_space_for_delayed_refs(trans);
2865 }
2866
2867 struct async_delayed_refs {
2868         struct btrfs_root *root;
2869         u64 transid;
2870         int count;
2871         int error;
2872         int sync;
2873         struct completion wait;
2874         struct btrfs_work work;
2875 };
2876
2877 static inline struct async_delayed_refs *
2878 to_async_delayed_refs(struct btrfs_work *work)
2879 {
2880         return container_of(work, struct async_delayed_refs, work);
2881 }
2882
2883 static void delayed_ref_async_start(struct btrfs_work *work)
2884 {
2885         struct async_delayed_refs *async = to_async_delayed_refs(work);
2886         struct btrfs_trans_handle *trans;
2887         struct btrfs_fs_info *fs_info = async->root->fs_info;
2888         int ret;
2889
2890         /* if the commit is already started, we don't need to wait here */
2891         if (btrfs_transaction_blocked(fs_info))
2892                 goto done;
2893
2894         trans = btrfs_join_transaction(async->root);
2895         if (IS_ERR(trans)) {
2896                 async->error = PTR_ERR(trans);
2897                 goto done;
2898         }
2899
2900         /*
2901          * trans->sync means that when we call end_transaction, we won't
2902          * wait on delayed refs
2903          */
2904         trans->sync = true;
2905
2906         /* Don't bother flushing if we got into a different transaction */
2907         if (trans->transid > async->transid)
2908                 goto end;
2909
2910         ret = btrfs_run_delayed_refs(trans, async->count);
2911         if (ret)
2912                 async->error = ret;
2913 end:
2914         ret = btrfs_end_transaction(trans);
2915         if (ret && !async->error)
2916                 async->error = ret;
2917 done:
2918         if (async->sync)
2919                 complete(&async->wait);
2920         else
2921                 kfree(async);
2922 }
2923
2924 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2925                                  unsigned long count, u64 transid, int wait)
2926 {
2927         struct async_delayed_refs *async;
2928         int ret;
2929
2930         async = kmalloc(sizeof(*async), GFP_NOFS);
2931         if (!async)
2932                 return -ENOMEM;
2933
2934         async->root = fs_info->tree_root;
2935         async->count = count;
2936         async->error = 0;
2937         async->transid = transid;
2938         if (wait)
2939                 async->sync = 1;
2940         else
2941                 async->sync = 0;
2942         init_completion(&async->wait);
2943
2944         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2945                         delayed_ref_async_start, NULL, NULL);
2946
2947         btrfs_queue_work(fs_info->extent_workers, &async->work);
2948
2949         if (wait) {
2950                 wait_for_completion(&async->wait);
2951                 ret = async->error;
2952                 kfree(async);
2953                 return ret;
2954         }
2955         return 0;
2956 }
2957
2958 /*
2959  * this starts processing the delayed reference count updates and
2960  * extent insertions we have queued up so far.  count can be
2961  * 0, which means to process everything in the tree at the start
2962  * of the run (but not newly added entries), or it can be some target
2963  * number you'd like to process.
2964  *
2965  * Returns 0 on success or if called with an aborted transaction
2966  * Returns <0 on error and aborts the transaction
2967  */
2968 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2969                            unsigned long count)
2970 {
2971         struct btrfs_fs_info *fs_info = trans->fs_info;
2972         struct rb_node *node;
2973         struct btrfs_delayed_ref_root *delayed_refs;
2974         struct btrfs_delayed_ref_head *head;
2975         int ret;
2976         int run_all = count == (unsigned long)-1;
2977
2978         /* We'll clean this up in btrfs_cleanup_transaction */
2979         if (trans->aborted)
2980                 return 0;
2981
2982         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2983                 return 0;
2984
2985         delayed_refs = &trans->transaction->delayed_refs;
2986         if (count == 0)
2987                 count = atomic_read(&delayed_refs->num_entries) * 2;
2988
2989 again:
2990 #ifdef SCRAMBLE_DELAYED_REFS
2991         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2992 #endif
2993         ret = __btrfs_run_delayed_refs(trans, count);
2994         if (ret < 0) {
2995                 btrfs_abort_transaction(trans, ret);
2996                 return ret;
2997         }
2998
2999         if (run_all) {
3000                 if (!list_empty(&trans->new_bgs))
3001                         btrfs_create_pending_block_groups(trans);
3002
3003                 spin_lock(&delayed_refs->lock);
3004                 node = rb_first_cached(&delayed_refs->href_root);
3005                 if (!node) {
3006                         spin_unlock(&delayed_refs->lock);
3007                         goto out;
3008                 }
3009                 head = rb_entry(node, struct btrfs_delayed_ref_head,
3010                                 href_node);
3011                 refcount_inc(&head->refs);
3012                 spin_unlock(&delayed_refs->lock);
3013
3014                 /* Mutex was contended, block until it's released and retry. */
3015                 mutex_lock(&head->mutex);
3016                 mutex_unlock(&head->mutex);
3017
3018                 btrfs_put_delayed_ref_head(head);
3019                 cond_resched();
3020                 goto again;
3021         }
3022 out:
3023         return 0;
3024 }
3025
3026 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3027                                 struct btrfs_fs_info *fs_info,
3028                                 u64 bytenr, u64 num_bytes, u64 flags,
3029                                 int level, int is_data)
3030 {
3031         struct btrfs_delayed_extent_op *extent_op;
3032         int ret;
3033
3034         extent_op = btrfs_alloc_delayed_extent_op();
3035         if (!extent_op)
3036                 return -ENOMEM;
3037
3038         extent_op->flags_to_set = flags;
3039         extent_op->update_flags = true;
3040         extent_op->update_key = false;
3041         extent_op->is_data = is_data ? true : false;
3042         extent_op->level = level;
3043
3044         ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3045                                           num_bytes, extent_op);
3046         if (ret)
3047                 btrfs_free_delayed_extent_op(extent_op);
3048         return ret;
3049 }
3050
3051 static noinline int check_delayed_ref(struct btrfs_root *root,
3052                                       struct btrfs_path *path,
3053                                       u64 objectid, u64 offset, u64 bytenr)
3054 {
3055         struct btrfs_delayed_ref_head *head;
3056         struct btrfs_delayed_ref_node *ref;
3057         struct btrfs_delayed_data_ref *data_ref;
3058         struct btrfs_delayed_ref_root *delayed_refs;
3059         struct btrfs_transaction *cur_trans;
3060         struct rb_node *node;
3061         int ret = 0;
3062
3063         spin_lock(&root->fs_info->trans_lock);
3064         cur_trans = root->fs_info->running_transaction;
3065         if (cur_trans)
3066                 refcount_inc(&cur_trans->use_count);
3067         spin_unlock(&root->fs_info->trans_lock);
3068         if (!cur_trans)
3069                 return 0;
3070
3071         delayed_refs = &cur_trans->delayed_refs;
3072         spin_lock(&delayed_refs->lock);
3073         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3074         if (!head) {
3075                 spin_unlock(&delayed_refs->lock);
3076                 btrfs_put_transaction(cur_trans);
3077                 return 0;
3078         }
3079
3080         if (!mutex_trylock(&head->mutex)) {
3081                 refcount_inc(&head->refs);
3082                 spin_unlock(&delayed_refs->lock);
3083
3084                 btrfs_release_path(path);
3085
3086                 /*
3087