btrfs: use narrower type for btrfs_transaction::num_dirty_bgs
[sfrench/cifs-2.6.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/sched/signal.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/sort.h>
24 #include <linux/rcupdate.h>
25 #include <linux/kthread.h>
26 #include <linux/slab.h>
27 #include <linux/ratelimit.h>
28 #include <linux/percpu_counter.h>
29 #include <linux/lockdep.h>
30 #include "hash.h"
31 #include "tree-log.h"
32 #include "disk-io.h"
33 #include "print-tree.h"
34 #include "volumes.h"
35 #include "raid56.h"
36 #include "locking.h"
37 #include "free-space-cache.h"
38 #include "free-space-tree.h"
39 #include "math.h"
40 #include "sysfs.h"
41 #include "qgroup.h"
42 #include "ref-verify.h"
43
44 #undef SCRAMBLE_DELAYED_REFS
45
46 /*
47  * control flags for do_chunk_alloc's force field
48  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
49  * if we really need one.
50  *
51  * CHUNK_ALLOC_LIMITED means to only try and allocate one
52  * if we have very few chunks already allocated.  This is
53  * used as part of the clustering code to help make sure
54  * we have a good pool of storage to cluster in, without
55  * filling the FS with empty chunks
56  *
57  * CHUNK_ALLOC_FORCE means it must try to allocate one
58  *
59  */
60 enum {
61         CHUNK_ALLOC_NO_FORCE = 0,
62         CHUNK_ALLOC_LIMITED = 1,
63         CHUNK_ALLOC_FORCE = 2,
64 };
65
66 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
67                                struct btrfs_fs_info *fs_info,
68                                 struct btrfs_delayed_ref_node *node, u64 parent,
69                                 u64 root_objectid, u64 owner_objectid,
70                                 u64 owner_offset, int refs_to_drop,
71                                 struct btrfs_delayed_extent_op *extra_op);
72 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
73                                     struct extent_buffer *leaf,
74                                     struct btrfs_extent_item *ei);
75 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
76                                       struct btrfs_fs_info *fs_info,
77                                       u64 parent, u64 root_objectid,
78                                       u64 flags, u64 owner, u64 offset,
79                                       struct btrfs_key *ins, int ref_mod);
80 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
81                                      struct btrfs_fs_info *fs_info,
82                                      u64 parent, u64 root_objectid,
83                                      u64 flags, struct btrfs_disk_key *key,
84                                      int level, struct btrfs_key *ins);
85 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
86                           struct btrfs_fs_info *fs_info, u64 flags,
87                           int force);
88 static int find_next_key(struct btrfs_path *path, int level,
89                          struct btrfs_key *key);
90 static void dump_space_info(struct btrfs_fs_info *fs_info,
91                             struct btrfs_space_info *info, u64 bytes,
92                             int dump_block_groups);
93 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
94                                u64 num_bytes);
95 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
96                                      struct btrfs_space_info *space_info,
97                                      u64 num_bytes);
98 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
99                                      struct btrfs_space_info *space_info,
100                                      u64 num_bytes);
101
102 static noinline int
103 block_group_cache_done(struct btrfs_block_group_cache *cache)
104 {
105         smp_mb();
106         return cache->cached == BTRFS_CACHE_FINISHED ||
107                 cache->cached == BTRFS_CACHE_ERROR;
108 }
109
110 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
111 {
112         return (cache->flags & bits) == bits;
113 }
114
115 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
116 {
117         atomic_inc(&cache->count);
118 }
119
120 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
121 {
122         if (atomic_dec_and_test(&cache->count)) {
123                 WARN_ON(cache->pinned > 0);
124                 WARN_ON(cache->reserved > 0);
125
126                 /*
127                  * If not empty, someone is still holding mutex of
128                  * full_stripe_lock, which can only be released by caller.
129                  * And it will definitely cause use-after-free when caller
130                  * tries to release full stripe lock.
131                  *
132                  * No better way to resolve, but only to warn.
133                  */
134                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
135                 kfree(cache->free_space_ctl);
136                 kfree(cache);
137         }
138 }
139
140 /*
141  * this adds the block group to the fs_info rb tree for the block group
142  * cache
143  */
144 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
145                                 struct btrfs_block_group_cache *block_group)
146 {
147         struct rb_node **p;
148         struct rb_node *parent = NULL;
149         struct btrfs_block_group_cache *cache;
150
151         spin_lock(&info->block_group_cache_lock);
152         p = &info->block_group_cache_tree.rb_node;
153
154         while (*p) {
155                 parent = *p;
156                 cache = rb_entry(parent, struct btrfs_block_group_cache,
157                                  cache_node);
158                 if (block_group->key.objectid < cache->key.objectid) {
159                         p = &(*p)->rb_left;
160                 } else if (block_group->key.objectid > cache->key.objectid) {
161                         p = &(*p)->rb_right;
162                 } else {
163                         spin_unlock(&info->block_group_cache_lock);
164                         return -EEXIST;
165                 }
166         }
167
168         rb_link_node(&block_group->cache_node, parent, p);
169         rb_insert_color(&block_group->cache_node,
170                         &info->block_group_cache_tree);
171
172         if (info->first_logical_byte > block_group->key.objectid)
173                 info->first_logical_byte = block_group->key.objectid;
174
175         spin_unlock(&info->block_group_cache_lock);
176
177         return 0;
178 }
179
180 /*
181  * This will return the block group at or after bytenr if contains is 0, else
182  * it will return the block group that contains the bytenr
183  */
184 static struct btrfs_block_group_cache *
185 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
186                               int contains)
187 {
188         struct btrfs_block_group_cache *cache, *ret = NULL;
189         struct rb_node *n;
190         u64 end, start;
191
192         spin_lock(&info->block_group_cache_lock);
193         n = info->block_group_cache_tree.rb_node;
194
195         while (n) {
196                 cache = rb_entry(n, struct btrfs_block_group_cache,
197                                  cache_node);
198                 end = cache->key.objectid + cache->key.offset - 1;
199                 start = cache->key.objectid;
200
201                 if (bytenr < start) {
202                         if (!contains && (!ret || start < ret->key.objectid))
203                                 ret = cache;
204                         n = n->rb_left;
205                 } else if (bytenr > start) {
206                         if (contains && bytenr <= end) {
207                                 ret = cache;
208                                 break;
209                         }
210                         n = n->rb_right;
211                 } else {
212                         ret = cache;
213                         break;
214                 }
215         }
216         if (ret) {
217                 btrfs_get_block_group(ret);
218                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
219                         info->first_logical_byte = ret->key.objectid;
220         }
221         spin_unlock(&info->block_group_cache_lock);
222
223         return ret;
224 }
225
226 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
227                                u64 start, u64 num_bytes)
228 {
229         u64 end = start + num_bytes - 1;
230         set_extent_bits(&fs_info->freed_extents[0],
231                         start, end, EXTENT_UPTODATE);
232         set_extent_bits(&fs_info->freed_extents[1],
233                         start, end, EXTENT_UPTODATE);
234         return 0;
235 }
236
237 static void free_excluded_extents(struct btrfs_fs_info *fs_info,
238                                   struct btrfs_block_group_cache *cache)
239 {
240         u64 start, end;
241
242         start = cache->key.objectid;
243         end = start + cache->key.offset - 1;
244
245         clear_extent_bits(&fs_info->freed_extents[0],
246                           start, end, EXTENT_UPTODATE);
247         clear_extent_bits(&fs_info->freed_extents[1],
248                           start, end, EXTENT_UPTODATE);
249 }
250
251 static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
252                                  struct btrfs_block_group_cache *cache)
253 {
254         u64 bytenr;
255         u64 *logical;
256         int stripe_len;
257         int i, nr, ret;
258
259         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
260                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
261                 cache->bytes_super += stripe_len;
262                 ret = add_excluded_extent(fs_info, cache->key.objectid,
263                                           stripe_len);
264                 if (ret)
265                         return ret;
266         }
267
268         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
269                 bytenr = btrfs_sb_offset(i);
270                 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
271                                        bytenr, 0, &logical, &nr, &stripe_len);
272                 if (ret)
273                         return ret;
274
275                 while (nr--) {
276                         u64 start, len;
277
278                         if (logical[nr] > cache->key.objectid +
279                             cache->key.offset)
280                                 continue;
281
282                         if (logical[nr] + stripe_len <= cache->key.objectid)
283                                 continue;
284
285                         start = logical[nr];
286                         if (start < cache->key.objectid) {
287                                 start = cache->key.objectid;
288                                 len = (logical[nr] + stripe_len) - start;
289                         } else {
290                                 len = min_t(u64, stripe_len,
291                                             cache->key.objectid +
292                                             cache->key.offset - start);
293                         }
294
295                         cache->bytes_super += len;
296                         ret = add_excluded_extent(fs_info, start, len);
297                         if (ret) {
298                                 kfree(logical);
299                                 return ret;
300                         }
301                 }
302
303                 kfree(logical);
304         }
305         return 0;
306 }
307
308 static struct btrfs_caching_control *
309 get_caching_control(struct btrfs_block_group_cache *cache)
310 {
311         struct btrfs_caching_control *ctl;
312
313         spin_lock(&cache->lock);
314         if (!cache->caching_ctl) {
315                 spin_unlock(&cache->lock);
316                 return NULL;
317         }
318
319         ctl = cache->caching_ctl;
320         refcount_inc(&ctl->count);
321         spin_unlock(&cache->lock);
322         return ctl;
323 }
324
325 static void put_caching_control(struct btrfs_caching_control *ctl)
326 {
327         if (refcount_dec_and_test(&ctl->count))
328                 kfree(ctl);
329 }
330
331 #ifdef CONFIG_BTRFS_DEBUG
332 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
333 {
334         struct btrfs_fs_info *fs_info = block_group->fs_info;
335         u64 start = block_group->key.objectid;
336         u64 len = block_group->key.offset;
337         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
338                 fs_info->nodesize : fs_info->sectorsize;
339         u64 step = chunk << 1;
340
341         while (len > chunk) {
342                 btrfs_remove_free_space(block_group, start, chunk);
343                 start += step;
344                 if (len < step)
345                         len = 0;
346                 else
347                         len -= step;
348         }
349 }
350 #endif
351
352 /*
353  * this is only called by cache_block_group, since we could have freed extents
354  * we need to check the pinned_extents for any extents that can't be used yet
355  * since their free space will be released as soon as the transaction commits.
356  */
357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
358                        struct btrfs_fs_info *info, u64 start, u64 end)
359 {
360         u64 extent_start, extent_end, size, total_added = 0;
361         int ret;
362
363         while (start < end) {
364                 ret = find_first_extent_bit(info->pinned_extents, start,
365                                             &extent_start, &extent_end,
366                                             EXTENT_DIRTY | EXTENT_UPTODATE,
367                                             NULL);
368                 if (ret)
369                         break;
370
371                 if (extent_start <= start) {
372                         start = extent_end + 1;
373                 } else if (extent_start > start && extent_start < end) {
374                         size = extent_start - start;
375                         total_added += size;
376                         ret = btrfs_add_free_space(block_group, start,
377                                                    size);
378                         BUG_ON(ret); /* -ENOMEM or logic error */
379                         start = extent_end + 1;
380                 } else {
381                         break;
382                 }
383         }
384
385         if (start < end) {
386                 size = end - start;
387                 total_added += size;
388                 ret = btrfs_add_free_space(block_group, start, size);
389                 BUG_ON(ret); /* -ENOMEM or logic error */
390         }
391
392         return total_added;
393 }
394
395 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
396 {
397         struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
398         struct btrfs_fs_info *fs_info = block_group->fs_info;
399         struct btrfs_root *extent_root = fs_info->extent_root;
400         struct btrfs_path *path;
401         struct extent_buffer *leaf;
402         struct btrfs_key key;
403         u64 total_found = 0;
404         u64 last = 0;
405         u32 nritems;
406         int ret;
407         bool wakeup = true;
408
409         path = btrfs_alloc_path();
410         if (!path)
411                 return -ENOMEM;
412
413         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
414
415 #ifdef CONFIG_BTRFS_DEBUG
416         /*
417          * If we're fragmenting we don't want to make anybody think we can
418          * allocate from this block group until we've had a chance to fragment
419          * the free space.
420          */
421         if (btrfs_should_fragment_free_space(block_group))
422                 wakeup = false;
423 #endif
424         /*
425          * We don't want to deadlock with somebody trying to allocate a new
426          * extent for the extent root while also trying to search the extent
427          * root to add free space.  So we skip locking and search the commit
428          * root, since its read-only
429          */
430         path->skip_locking = 1;
431         path->search_commit_root = 1;
432         path->reada = READA_FORWARD;
433
434         key.objectid = last;
435         key.offset = 0;
436         key.type = BTRFS_EXTENT_ITEM_KEY;
437
438 next:
439         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
440         if (ret < 0)
441                 goto out;
442
443         leaf = path->nodes[0];
444         nritems = btrfs_header_nritems(leaf);
445
446         while (1) {
447                 if (btrfs_fs_closing(fs_info) > 1) {
448                         last = (u64)-1;
449                         break;
450                 }
451
452                 if (path->slots[0] < nritems) {
453                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
454                 } else {
455                         ret = find_next_key(path, 0, &key);
456                         if (ret)
457                                 break;
458
459                         if (need_resched() ||
460                             rwsem_is_contended(&fs_info->commit_root_sem)) {
461                                 if (wakeup)
462                                         caching_ctl->progress = last;
463                                 btrfs_release_path(path);
464                                 up_read(&fs_info->commit_root_sem);
465                                 mutex_unlock(&caching_ctl->mutex);
466                                 cond_resched();
467                                 mutex_lock(&caching_ctl->mutex);
468                                 down_read(&fs_info->commit_root_sem);
469                                 goto next;
470                         }
471
472                         ret = btrfs_next_leaf(extent_root, path);
473                         if (ret < 0)
474                                 goto out;
475                         if (ret)
476                                 break;
477                         leaf = path->nodes[0];
478                         nritems = btrfs_header_nritems(leaf);
479                         continue;
480                 }
481
482                 if (key.objectid < last) {
483                         key.objectid = last;
484                         key.offset = 0;
485                         key.type = BTRFS_EXTENT_ITEM_KEY;
486
487                         if (wakeup)
488                                 caching_ctl->progress = last;
489                         btrfs_release_path(path);
490                         goto next;
491                 }
492
493                 if (key.objectid < block_group->key.objectid) {
494                         path->slots[0]++;
495                         continue;
496                 }
497
498                 if (key.objectid >= block_group->key.objectid +
499                     block_group->key.offset)
500                         break;
501
502                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
503                     key.type == BTRFS_METADATA_ITEM_KEY) {
504                         total_found += add_new_free_space(block_group,
505                                                           fs_info, last,
506                                                           key.objectid);
507                         if (key.type == BTRFS_METADATA_ITEM_KEY)
508                                 last = key.objectid +
509                                         fs_info->nodesize;
510                         else
511                                 last = key.objectid + key.offset;
512
513                         if (total_found > CACHING_CTL_WAKE_UP) {
514                                 total_found = 0;
515                                 if (wakeup)
516                                         wake_up(&caching_ctl->wait);
517                         }
518                 }
519                 path->slots[0]++;
520         }
521         ret = 0;
522
523         total_found += add_new_free_space(block_group, fs_info, last,
524                                           block_group->key.objectid +
525                                           block_group->key.offset);
526         caching_ctl->progress = (u64)-1;
527
528 out:
529         btrfs_free_path(path);
530         return ret;
531 }
532
533 static noinline void caching_thread(struct btrfs_work *work)
534 {
535         struct btrfs_block_group_cache *block_group;
536         struct btrfs_fs_info *fs_info;
537         struct btrfs_caching_control *caching_ctl;
538         struct btrfs_root *extent_root;
539         int ret;
540
541         caching_ctl = container_of(work, struct btrfs_caching_control, work);
542         block_group = caching_ctl->block_group;
543         fs_info = block_group->fs_info;
544         extent_root = fs_info->extent_root;
545
546         mutex_lock(&caching_ctl->mutex);
547         down_read(&fs_info->commit_root_sem);
548
549         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
550                 ret = load_free_space_tree(caching_ctl);
551         else
552                 ret = load_extent_tree_free(caching_ctl);
553
554         spin_lock(&block_group->lock);
555         block_group->caching_ctl = NULL;
556         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
557         spin_unlock(&block_group->lock);
558
559 #ifdef CONFIG_BTRFS_DEBUG
560         if (btrfs_should_fragment_free_space(block_group)) {
561                 u64 bytes_used;
562
563                 spin_lock(&block_group->space_info->lock);
564                 spin_lock(&block_group->lock);
565                 bytes_used = block_group->key.offset -
566                         btrfs_block_group_used(&block_group->item);
567                 block_group->space_info->bytes_used += bytes_used >> 1;
568                 spin_unlock(&block_group->lock);
569                 spin_unlock(&block_group->space_info->lock);
570                 fragment_free_space(block_group);
571         }
572 #endif
573
574         caching_ctl->progress = (u64)-1;
575
576         up_read(&fs_info->commit_root_sem);
577         free_excluded_extents(fs_info, block_group);
578         mutex_unlock(&caching_ctl->mutex);
579
580         wake_up(&caching_ctl->wait);
581
582         put_caching_control(caching_ctl);
583         btrfs_put_block_group(block_group);
584 }
585
586 static int cache_block_group(struct btrfs_block_group_cache *cache,
587                              int load_cache_only)
588 {
589         DEFINE_WAIT(wait);
590         struct btrfs_fs_info *fs_info = cache->fs_info;
591         struct btrfs_caching_control *caching_ctl;
592         int ret = 0;
593
594         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
595         if (!caching_ctl)
596                 return -ENOMEM;
597
598         INIT_LIST_HEAD(&caching_ctl->list);
599         mutex_init(&caching_ctl->mutex);
600         init_waitqueue_head(&caching_ctl->wait);
601         caching_ctl->block_group = cache;
602         caching_ctl->progress = cache->key.objectid;
603         refcount_set(&caching_ctl->count, 1);
604         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
605                         caching_thread, NULL, NULL);
606
607         spin_lock(&cache->lock);
608         /*
609          * This should be a rare occasion, but this could happen I think in the
610          * case where one thread starts to load the space cache info, and then
611          * some other thread starts a transaction commit which tries to do an
612          * allocation while the other thread is still loading the space cache
613          * info.  The previous loop should have kept us from choosing this block
614          * group, but if we've moved to the state where we will wait on caching
615          * block groups we need to first check if we're doing a fast load here,
616          * so we can wait for it to finish, otherwise we could end up allocating
617          * from a block group who's cache gets evicted for one reason or
618          * another.
619          */
620         while (cache->cached == BTRFS_CACHE_FAST) {
621                 struct btrfs_caching_control *ctl;
622
623                 ctl = cache->caching_ctl;
624                 refcount_inc(&ctl->count);
625                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
626                 spin_unlock(&cache->lock);
627
628                 schedule();
629
630                 finish_wait(&ctl->wait, &wait);
631                 put_caching_control(ctl);
632                 spin_lock(&cache->lock);
633         }
634
635         if (cache->cached != BTRFS_CACHE_NO) {
636                 spin_unlock(&cache->lock);
637                 kfree(caching_ctl);
638                 return 0;
639         }
640         WARN_ON(cache->caching_ctl);
641         cache->caching_ctl = caching_ctl;
642         cache->cached = BTRFS_CACHE_FAST;
643         spin_unlock(&cache->lock);
644
645         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
646                 mutex_lock(&caching_ctl->mutex);
647                 ret = load_free_space_cache(fs_info, cache);
648
649                 spin_lock(&cache->lock);
650                 if (ret == 1) {
651                         cache->caching_ctl = NULL;
652                         cache->cached = BTRFS_CACHE_FINISHED;
653                         cache->last_byte_to_unpin = (u64)-1;
654                         caching_ctl->progress = (u64)-1;
655                 } else {
656                         if (load_cache_only) {
657                                 cache->caching_ctl = NULL;
658                                 cache->cached = BTRFS_CACHE_NO;
659                         } else {
660                                 cache->cached = BTRFS_CACHE_STARTED;
661                                 cache->has_caching_ctl = 1;
662                         }
663                 }
664                 spin_unlock(&cache->lock);
665 #ifdef CONFIG_BTRFS_DEBUG
666                 if (ret == 1 &&
667                     btrfs_should_fragment_free_space(cache)) {
668                         u64 bytes_used;
669
670                         spin_lock(&cache->space_info->lock);
671                         spin_lock(&cache->lock);
672                         bytes_used = cache->key.offset -
673                                 btrfs_block_group_used(&cache->item);
674                         cache->space_info->bytes_used += bytes_used >> 1;
675                         spin_unlock(&cache->lock);
676                         spin_unlock(&cache->space_info->lock);
677                         fragment_free_space(cache);
678                 }
679 #endif
680                 mutex_unlock(&caching_ctl->mutex);
681
682                 wake_up(&caching_ctl->wait);
683                 if (ret == 1) {
684                         put_caching_control(caching_ctl);
685                         free_excluded_extents(fs_info, cache);
686                         return 0;
687                 }
688         } else {
689                 /*
690                  * We're either using the free space tree or no caching at all.
691                  * Set cached to the appropriate value and wakeup any waiters.
692                  */
693                 spin_lock(&cache->lock);
694                 if (load_cache_only) {
695                         cache->caching_ctl = NULL;
696                         cache->cached = BTRFS_CACHE_NO;
697                 } else {
698                         cache->cached = BTRFS_CACHE_STARTED;
699                         cache->has_caching_ctl = 1;
700                 }
701                 spin_unlock(&cache->lock);
702                 wake_up(&caching_ctl->wait);
703         }
704
705         if (load_cache_only) {
706                 put_caching_control(caching_ctl);
707                 return 0;
708         }
709
710         down_write(&fs_info->commit_root_sem);
711         refcount_inc(&caching_ctl->count);
712         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
713         up_write(&fs_info->commit_root_sem);
714
715         btrfs_get_block_group(cache);
716
717         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
718
719         return ret;
720 }
721
722 /*
723  * return the block group that starts at or after bytenr
724  */
725 static struct btrfs_block_group_cache *
726 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
727 {
728         return block_group_cache_tree_search(info, bytenr, 0);
729 }
730
731 /*
732  * return the block group that contains the given bytenr
733  */
734 struct btrfs_block_group_cache *btrfs_lookup_block_group(
735                                                  struct btrfs_fs_info *info,
736                                                  u64 bytenr)
737 {
738         return block_group_cache_tree_search(info, bytenr, 1);
739 }
740
741 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
742                                                   u64 flags)
743 {
744         struct list_head *head = &info->space_info;
745         struct btrfs_space_info *found;
746
747         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
748
749         rcu_read_lock();
750         list_for_each_entry_rcu(found, head, list) {
751                 if (found->flags & flags) {
752                         rcu_read_unlock();
753                         return found;
754                 }
755         }
756         rcu_read_unlock();
757         return NULL;
758 }
759
760 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
761                              u64 owner, u64 root_objectid)
762 {
763         struct btrfs_space_info *space_info;
764         u64 flags;
765
766         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
767                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
768                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
769                 else
770                         flags = BTRFS_BLOCK_GROUP_METADATA;
771         } else {
772                 flags = BTRFS_BLOCK_GROUP_DATA;
773         }
774
775         space_info = __find_space_info(fs_info, flags);
776         ASSERT(space_info);
777         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
778 }
779
780 /*
781  * after adding space to the filesystem, we need to clear the full flags
782  * on all the space infos.
783  */
784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
785 {
786         struct list_head *head = &info->space_info;
787         struct btrfs_space_info *found;
788
789         rcu_read_lock();
790         list_for_each_entry_rcu(found, head, list)
791                 found->full = 0;
792         rcu_read_unlock();
793 }
794
795 /* simple helper to search for an existing data extent at a given offset */
796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
797 {
798         int ret;
799         struct btrfs_key key;
800         struct btrfs_path *path;
801
802         path = btrfs_alloc_path();
803         if (!path)
804                 return -ENOMEM;
805
806         key.objectid = start;
807         key.offset = len;
808         key.type = BTRFS_EXTENT_ITEM_KEY;
809         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
810         btrfs_free_path(path);
811         return ret;
812 }
813
814 /*
815  * helper function to lookup reference count and flags of a tree block.
816  *
817  * the head node for delayed ref is used to store the sum of all the
818  * reference count modifications queued up in the rbtree. the head
819  * node may also store the extent flags to set. This way you can check
820  * to see what the reference count and extent flags would be if all of
821  * the delayed refs are not processed.
822  */
823 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
824                              struct btrfs_fs_info *fs_info, u64 bytenr,
825                              u64 offset, int metadata, u64 *refs, u64 *flags)
826 {
827         struct btrfs_delayed_ref_head *head;
828         struct btrfs_delayed_ref_root *delayed_refs;
829         struct btrfs_path *path;
830         struct btrfs_extent_item *ei;
831         struct extent_buffer *leaf;
832         struct btrfs_key key;
833         u32 item_size;
834         u64 num_refs;
835         u64 extent_flags;
836         int ret;
837
838         /*
839          * If we don't have skinny metadata, don't bother doing anything
840          * different
841          */
842         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
843                 offset = fs_info->nodesize;
844                 metadata = 0;
845         }
846
847         path = btrfs_alloc_path();
848         if (!path)
849                 return -ENOMEM;
850
851         if (!trans) {
852                 path->skip_locking = 1;
853                 path->search_commit_root = 1;
854         }
855
856 search_again:
857         key.objectid = bytenr;
858         key.offset = offset;
859         if (metadata)
860                 key.type = BTRFS_METADATA_ITEM_KEY;
861         else
862                 key.type = BTRFS_EXTENT_ITEM_KEY;
863
864         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
865         if (ret < 0)
866                 goto out_free;
867
868         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
869                 if (path->slots[0]) {
870                         path->slots[0]--;
871                         btrfs_item_key_to_cpu(path->nodes[0], &key,
872                                               path->slots[0]);
873                         if (key.objectid == bytenr &&
874                             key.type == BTRFS_EXTENT_ITEM_KEY &&
875                             key.offset == fs_info->nodesize)
876                                 ret = 0;
877                 }
878         }
879
880         if (ret == 0) {
881                 leaf = path->nodes[0];
882                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
883                 if (item_size >= sizeof(*ei)) {
884                         ei = btrfs_item_ptr(leaf, path->slots[0],
885                                             struct btrfs_extent_item);
886                         num_refs = btrfs_extent_refs(leaf, ei);
887                         extent_flags = btrfs_extent_flags(leaf, ei);
888                 } else {
889 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
890                         struct btrfs_extent_item_v0 *ei0;
891                         BUG_ON(item_size != sizeof(*ei0));
892                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
893                                              struct btrfs_extent_item_v0);
894                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
895                         /* FIXME: this isn't correct for data */
896                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
897 #else
898                         BUG();
899 #endif
900                 }
901                 BUG_ON(num_refs == 0);
902         } else {
903                 num_refs = 0;
904                 extent_flags = 0;
905                 ret = 0;
906         }
907
908         if (!trans)
909                 goto out;
910
911         delayed_refs = &trans->transaction->delayed_refs;
912         spin_lock(&delayed_refs->lock);
913         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
914         if (head) {
915                 if (!mutex_trylock(&head->mutex)) {
916                         refcount_inc(&head->refs);
917                         spin_unlock(&delayed_refs->lock);
918
919                         btrfs_release_path(path);
920
921                         /*
922                          * Mutex was contended, block until it's released and try
923                          * again
924                          */
925                         mutex_lock(&head->mutex);
926                         mutex_unlock(&head->mutex);
927                         btrfs_put_delayed_ref_head(head);
928                         goto search_again;
929                 }
930                 spin_lock(&head->lock);
931                 if (head->extent_op && head->extent_op->update_flags)
932                         extent_flags |= head->extent_op->flags_to_set;
933                 else
934                         BUG_ON(num_refs == 0);
935
936                 num_refs += head->ref_mod;
937                 spin_unlock(&head->lock);
938                 mutex_unlock(&head->mutex);
939         }
940         spin_unlock(&delayed_refs->lock);
941 out:
942         WARN_ON(num_refs == 0);
943         if (refs)
944                 *refs = num_refs;
945         if (flags)
946                 *flags = extent_flags;
947 out_free:
948         btrfs_free_path(path);
949         return ret;
950 }
951
952 /*
953  * Back reference rules.  Back refs have three main goals:
954  *
955  * 1) differentiate between all holders of references to an extent so that
956  *    when a reference is dropped we can make sure it was a valid reference
957  *    before freeing the extent.
958  *
959  * 2) Provide enough information to quickly find the holders of an extent
960  *    if we notice a given block is corrupted or bad.
961  *
962  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
963  *    maintenance.  This is actually the same as #2, but with a slightly
964  *    different use case.
965  *
966  * There are two kinds of back refs. The implicit back refs is optimized
967  * for pointers in non-shared tree blocks. For a given pointer in a block,
968  * back refs of this kind provide information about the block's owner tree
969  * and the pointer's key. These information allow us to find the block by
970  * b-tree searching. The full back refs is for pointers in tree blocks not
971  * referenced by their owner trees. The location of tree block is recorded
972  * in the back refs. Actually the full back refs is generic, and can be
973  * used in all cases the implicit back refs is used. The major shortcoming
974  * of the full back refs is its overhead. Every time a tree block gets
975  * COWed, we have to update back refs entry for all pointers in it.
976  *
977  * For a newly allocated tree block, we use implicit back refs for
978  * pointers in it. This means most tree related operations only involve
979  * implicit back refs. For a tree block created in old transaction, the
980  * only way to drop a reference to it is COW it. So we can detect the
981  * event that tree block loses its owner tree's reference and do the
982  * back refs conversion.
983  *
984  * When a tree block is COWed through a tree, there are four cases:
985  *
986  * The reference count of the block is one and the tree is the block's
987  * owner tree. Nothing to do in this case.
988  *
989  * The reference count of the block is one and the tree is not the
990  * block's owner tree. In this case, full back refs is used for pointers
991  * in the block. Remove these full back refs, add implicit back refs for
992  * every pointers in the new block.
993  *
994  * The reference count of the block is greater than one and the tree is
995  * the block's owner tree. In this case, implicit back refs is used for
996  * pointers in the block. Add full back refs for every pointers in the
997  * block, increase lower level extents' reference counts. The original
998  * implicit back refs are entailed to the new block.
999  *
1000  * The reference count of the block is greater than one and the tree is
1001  * not the block's owner tree. Add implicit back refs for every pointer in
1002  * the new block, increase lower level extents' reference count.
1003  *
1004  * Back Reference Key composing:
1005  *
1006  * The key objectid corresponds to the first byte in the extent,
1007  * The key type is used to differentiate between types of back refs.
1008  * There are different meanings of the key offset for different types
1009  * of back refs.
1010  *
1011  * File extents can be referenced by:
1012  *
1013  * - multiple snapshots, subvolumes, or different generations in one subvol
1014  * - different files inside a single subvolume
1015  * - different offsets inside a file (bookend extents in file.c)
1016  *
1017  * The extent ref structure for the implicit back refs has fields for:
1018  *
1019  * - Objectid of the subvolume root
1020  * - objectid of the file holding the reference
1021  * - original offset in the file
1022  * - how many bookend extents
1023  *
1024  * The key offset for the implicit back refs is hash of the first
1025  * three fields.
1026  *
1027  * The extent ref structure for the full back refs has field for:
1028  *
1029  * - number of pointers in the tree leaf
1030  *
1031  * The key offset for the implicit back refs is the first byte of
1032  * the tree leaf
1033  *
1034  * When a file extent is allocated, The implicit back refs is used.
1035  * the fields are filled in:
1036  *
1037  *     (root_key.objectid, inode objectid, offset in file, 1)
1038  *
1039  * When a file extent is removed file truncation, we find the
1040  * corresponding implicit back refs and check the following fields:
1041  *
1042  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1043  *
1044  * Btree extents can be referenced by:
1045  *
1046  * - Different subvolumes
1047  *
1048  * Both the implicit back refs and the full back refs for tree blocks
1049  * only consist of key. The key offset for the implicit back refs is
1050  * objectid of block's owner tree. The key offset for the full back refs
1051  * is the first byte of parent block.
1052  *
1053  * When implicit back refs is used, information about the lowest key and
1054  * level of the tree block are required. These information are stored in
1055  * tree block info structure.
1056  */
1057
1058 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1059 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1060                                   struct btrfs_fs_info *fs_info,
1061                                   struct btrfs_path *path,
1062                                   u64 owner, u32 extra_size)
1063 {
1064         struct btrfs_root *root = fs_info->extent_root;
1065         struct btrfs_extent_item *item;
1066         struct btrfs_extent_item_v0 *ei0;
1067         struct btrfs_extent_ref_v0 *ref0;
1068         struct btrfs_tree_block_info *bi;
1069         struct extent_buffer *leaf;
1070         struct btrfs_key key;
1071         struct btrfs_key found_key;
1072         u32 new_size = sizeof(*item);
1073         u64 refs;
1074         int ret;
1075
1076         leaf = path->nodes[0];
1077         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1078
1079         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1080         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1081                              struct btrfs_extent_item_v0);
1082         refs = btrfs_extent_refs_v0(leaf, ei0);
1083
1084         if (owner == (u64)-1) {
1085                 while (1) {
1086                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1087                                 ret = btrfs_next_leaf(root, path);
1088                                 if (ret < 0)
1089                                         return ret;
1090                                 BUG_ON(ret > 0); /* Corruption */
1091                                 leaf = path->nodes[0];
1092                         }
1093                         btrfs_item_key_to_cpu(leaf, &found_key,
1094                                               path->slots[0]);
1095                         BUG_ON(key.objectid != found_key.objectid);
1096                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1097                                 path->slots[0]++;
1098                                 continue;
1099                         }
1100                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1101                                               struct btrfs_extent_ref_v0);
1102                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1103                         break;
1104                 }
1105         }
1106         btrfs_release_path(path);
1107
1108         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1109                 new_size += sizeof(*bi);
1110
1111         new_size -= sizeof(*ei0);
1112         ret = btrfs_search_slot(trans, root, &key, path,
1113                                 new_size + extra_size, 1);
1114         if (ret < 0)
1115                 return ret;
1116         BUG_ON(ret); /* Corruption */
1117
1118         btrfs_extend_item(fs_info, path, new_size);
1119
1120         leaf = path->nodes[0];
1121         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1122         btrfs_set_extent_refs(leaf, item, refs);
1123         /* FIXME: get real generation */
1124         btrfs_set_extent_generation(leaf, item, 0);
1125         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1126                 btrfs_set_extent_flags(leaf, item,
1127                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1128                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1129                 bi = (struct btrfs_tree_block_info *)(item + 1);
1130                 /* FIXME: get first key of the block */
1131                 memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
1132                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1133         } else {
1134                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1135         }
1136         btrfs_mark_buffer_dirty(leaf);
1137         return 0;
1138 }
1139 #endif
1140
1141 /*
1142  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1143  * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
1144  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1145  */
1146 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1147                                      struct btrfs_extent_inline_ref *iref,
1148                                      enum btrfs_inline_ref_type is_data)
1149 {
1150         int type = btrfs_extent_inline_ref_type(eb, iref);
1151         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1152
1153         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1154             type == BTRFS_SHARED_BLOCK_REF_KEY ||
1155             type == BTRFS_SHARED_DATA_REF_KEY ||
1156             type == BTRFS_EXTENT_DATA_REF_KEY) {
1157                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1158                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
1159                                 return type;
1160                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1161                                 ASSERT(eb->fs_info);
1162                                 /*
1163                                  * Every shared one has parent tree
1164                                  * block, which must be aligned to
1165                                  * nodesize.
1166                                  */
1167                                 if (offset &&
1168                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1169                                         return type;
1170                         }
1171                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1172                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
1173                                 return type;
1174                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
1175                                 ASSERT(eb->fs_info);
1176                                 /*
1177                                  * Every shared one has parent tree
1178                                  * block, which must be aligned to
1179                                  * nodesize.
1180                                  */
1181                                 if (offset &&
1182                                     IS_ALIGNED(offset, eb->fs_info->nodesize))
1183                                         return type;
1184                         }
1185                 } else {
1186                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1187                         return type;
1188                 }
1189         }
1190
1191         btrfs_print_leaf((struct extent_buffer *)eb);
1192         btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1193                   eb->start, type);
1194         WARN_ON(1);
1195
1196         return BTRFS_REF_TYPE_INVALID;
1197 }
1198
1199 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1200 {
1201         u32 high_crc = ~(u32)0;
1202         u32 low_crc = ~(u32)0;
1203         __le64 lenum;
1204
1205         lenum = cpu_to_le64(root_objectid);
1206         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1207         lenum = cpu_to_le64(owner);
1208         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1209         lenum = cpu_to_le64(offset);
1210         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1211
1212         return ((u64)high_crc << 31) ^ (u64)low_crc;
1213 }
1214
1215 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1216                                      struct btrfs_extent_data_ref *ref)
1217 {
1218         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1219                                     btrfs_extent_data_ref_objectid(leaf, ref),
1220                                     btrfs_extent_data_ref_offset(leaf, ref));
1221 }
1222
1223 static int match_extent_data_ref(struct extent_buffer *leaf,
1224                                  struct btrfs_extent_data_ref *ref,
1225                                  u64 root_objectid, u64 owner, u64 offset)
1226 {
1227         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1228             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1229             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1230                 return 0;
1231         return 1;
1232 }
1233
1234 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1235                                            struct btrfs_fs_info *fs_info,
1236                                            struct btrfs_path *path,
1237                                            u64 bytenr, u64 parent,
1238                                            u64 root_objectid,
1239                                            u64 owner, u64 offset)
1240 {
1241         struct btrfs_root *root = fs_info->extent_root;
1242         struct btrfs_key key;
1243         struct btrfs_extent_data_ref *ref;
1244         struct extent_buffer *leaf;
1245         u32 nritems;
1246         int ret;
1247         int recow;
1248         int err = -ENOENT;
1249
1250         key.objectid = bytenr;
1251         if (parent) {
1252                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1253                 key.offset = parent;
1254         } else {
1255                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1256                 key.offset = hash_extent_data_ref(root_objectid,
1257                                                   owner, offset);
1258         }
1259 again:
1260         recow = 0;
1261         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1262         if (ret < 0) {
1263                 err = ret;
1264                 goto fail;
1265         }
1266
1267         if (parent) {
1268                 if (!ret)
1269                         return 0;
1270 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1271                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1272                 btrfs_release_path(path);
1273                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1274                 if (ret < 0) {
1275                         err = ret;
1276                         goto fail;
1277                 }
1278                 if (!ret)
1279                         return 0;
1280 #endif
1281                 goto fail;
1282         }
1283
1284         leaf = path->nodes[0];
1285         nritems = btrfs_header_nritems(leaf);
1286         while (1) {
1287                 if (path->slots[0] >= nritems) {
1288                         ret = btrfs_next_leaf(root, path);
1289                         if (ret < 0)
1290                                 err = ret;
1291                         if (ret)
1292                                 goto fail;
1293
1294                         leaf = path->nodes[0];
1295                         nritems = btrfs_header_nritems(leaf);
1296                         recow = 1;
1297                 }
1298
1299                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1300                 if (key.objectid != bytenr ||
1301                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1302                         goto fail;
1303
1304                 ref = btrfs_item_ptr(leaf, path->slots[0],
1305                                      struct btrfs_extent_data_ref);
1306
1307                 if (match_extent_data_ref(leaf, ref, root_objectid,
1308                                           owner, offset)) {
1309                         if (recow) {
1310                                 btrfs_release_path(path);
1311                                 goto again;
1312                         }
1313                         err = 0;
1314                         break;
1315                 }
1316                 path->slots[0]++;
1317         }
1318 fail:
1319         return err;
1320 }
1321
1322 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1323                                            struct btrfs_fs_info *fs_info,
1324                                            struct btrfs_path *path,
1325                                            u64 bytenr, u64 parent,
1326                                            u64 root_objectid, u64 owner,
1327                                            u64 offset, int refs_to_add)
1328 {
1329         struct btrfs_root *root = fs_info->extent_root;
1330         struct btrfs_key key;
1331         struct extent_buffer *leaf;
1332         u32 size;
1333         u32 num_refs;
1334         int ret;
1335
1336         key.objectid = bytenr;
1337         if (parent) {
1338                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1339                 key.offset = parent;
1340                 size = sizeof(struct btrfs_shared_data_ref);
1341         } else {
1342                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1343                 key.offset = hash_extent_data_ref(root_objectid,
1344                                                   owner, offset);
1345                 size = sizeof(struct btrfs_extent_data_ref);
1346         }
1347
1348         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1349         if (ret && ret != -EEXIST)
1350                 goto fail;
1351
1352         leaf = path->nodes[0];
1353         if (parent) {
1354                 struct btrfs_shared_data_ref *ref;
1355                 ref = btrfs_item_ptr(leaf, path->slots[0],
1356                                      struct btrfs_shared_data_ref);
1357                 if (ret == 0) {
1358                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1359                 } else {
1360                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1361                         num_refs += refs_to_add;
1362                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1363                 }
1364         } else {
1365                 struct btrfs_extent_data_ref *ref;
1366                 while (ret == -EEXIST) {
1367                         ref = btrfs_item_ptr(leaf, path->slots[0],
1368                                              struct btrfs_extent_data_ref);
1369                         if (match_extent_data_ref(leaf, ref, root_objectid,
1370                                                   owner, offset))
1371                                 break;
1372                         btrfs_release_path(path);
1373                         key.offset++;
1374                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1375                                                       size);
1376                         if (ret && ret != -EEXIST)
1377                                 goto fail;
1378
1379                         leaf = path->nodes[0];
1380                 }
1381                 ref = btrfs_item_ptr(leaf, path->slots[0],
1382                                      struct btrfs_extent_data_ref);
1383                 if (ret == 0) {
1384                         btrfs_set_extent_data_ref_root(leaf, ref,
1385                                                        root_objectid);
1386                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1387                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1388                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1389                 } else {
1390                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1391                         num_refs += refs_to_add;
1392                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1393                 }
1394         }
1395         btrfs_mark_buffer_dirty(leaf);
1396         ret = 0;
1397 fail:
1398         btrfs_release_path(path);
1399         return ret;
1400 }
1401
1402 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1403                                            struct btrfs_fs_info *fs_info,
1404                                            struct btrfs_path *path,
1405                                            int refs_to_drop, int *last_ref)
1406 {
1407         struct btrfs_key key;
1408         struct btrfs_extent_data_ref *ref1 = NULL;
1409         struct btrfs_shared_data_ref *ref2 = NULL;
1410         struct extent_buffer *leaf;
1411         u32 num_refs = 0;
1412         int ret = 0;
1413
1414         leaf = path->nodes[0];
1415         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1416
1417         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1418                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1419                                       struct btrfs_extent_data_ref);
1420                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1421         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1422                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1423                                       struct btrfs_shared_data_ref);
1424                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1425 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1426         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1427                 struct btrfs_extent_ref_v0 *ref0;
1428                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1429                                       struct btrfs_extent_ref_v0);
1430                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1431 #endif
1432         } else {
1433                 BUG();
1434         }
1435
1436         BUG_ON(num_refs < refs_to_drop);
1437         num_refs -= refs_to_drop;
1438
1439         if (num_refs == 0) {
1440                 ret = btrfs_del_item(trans, fs_info->extent_root, path);
1441                 *last_ref = 1;
1442         } else {
1443                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1444                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1445                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1446                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1447 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1448                 else {
1449                         struct btrfs_extent_ref_v0 *ref0;
1450                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1451                                         struct btrfs_extent_ref_v0);
1452                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1453                 }
1454 #endif
1455                 btrfs_mark_buffer_dirty(leaf);
1456         }
1457         return ret;
1458 }
1459
1460 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1461                                           struct btrfs_extent_inline_ref *iref)
1462 {
1463         struct btrfs_key key;
1464         struct extent_buffer *leaf;
1465         struct btrfs_extent_data_ref *ref1;
1466         struct btrfs_shared_data_ref *ref2;
1467         u32 num_refs = 0;
1468         int type;
1469
1470         leaf = path->nodes[0];
1471         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1472         if (iref) {
1473                 /*
1474                  * If type is invalid, we should have bailed out earlier than
1475                  * this call.
1476                  */
1477                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1478                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1479                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1480                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1481                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1482                 } else {
1483                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1484                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1485                 }
1486         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1487                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1488                                       struct btrfs_extent_data_ref);
1489                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1490         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1491                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1492                                       struct btrfs_shared_data_ref);
1493                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1494 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1495         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1496                 struct btrfs_extent_ref_v0 *ref0;
1497                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1498                                       struct btrfs_extent_ref_v0);
1499                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1500 #endif
1501         } else {
1502                 WARN_ON(1);
1503         }
1504         return num_refs;
1505 }
1506
1507 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1508                                           struct btrfs_fs_info *fs_info,
1509                                           struct btrfs_path *path,
1510                                           u64 bytenr, u64 parent,
1511                                           u64 root_objectid)
1512 {
1513         struct btrfs_root *root = fs_info->extent_root;
1514         struct btrfs_key key;
1515         int ret;
1516
1517         key.objectid = bytenr;
1518         if (parent) {
1519                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1520                 key.offset = parent;
1521         } else {
1522                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1523                 key.offset = root_objectid;
1524         }
1525
1526         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1527         if (ret > 0)
1528                 ret = -ENOENT;
1529 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1530         if (ret == -ENOENT && parent) {
1531                 btrfs_release_path(path);
1532                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1533                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1534                 if (ret > 0)
1535                         ret = -ENOENT;
1536         }
1537 #endif
1538         return ret;
1539 }
1540
1541 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1542                                           struct btrfs_fs_info *fs_info,
1543                                           struct btrfs_path *path,
1544                                           u64 bytenr, u64 parent,
1545                                           u64 root_objectid)
1546 {
1547         struct btrfs_key key;
1548         int ret;
1549
1550         key.objectid = bytenr;
1551         if (parent) {
1552                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1553                 key.offset = parent;
1554         } else {
1555                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1556                 key.offset = root_objectid;
1557         }
1558
1559         ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1560                                       path, &key, 0);
1561         btrfs_release_path(path);
1562         return ret;
1563 }
1564
1565 static inline int extent_ref_type(u64 parent, u64 owner)
1566 {
1567         int type;
1568         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1569                 if (parent > 0)
1570                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1571                 else
1572                         type = BTRFS_TREE_BLOCK_REF_KEY;
1573         } else {
1574                 if (parent > 0)
1575                         type = BTRFS_SHARED_DATA_REF_KEY;
1576                 else
1577                         type = BTRFS_EXTENT_DATA_REF_KEY;
1578         }
1579         return type;
1580 }
1581
1582 static int find_next_key(struct btrfs_path *path, int level,
1583                          struct btrfs_key *key)
1584
1585 {
1586         for (; level < BTRFS_MAX_LEVEL; level++) {
1587                 if (!path->nodes[level])
1588                         break;
1589                 if (path->slots[level] + 1 >=
1590                     btrfs_header_nritems(path->nodes[level]))
1591                         continue;
1592                 if (level == 0)
1593                         btrfs_item_key_to_cpu(path->nodes[level], key,
1594                                               path->slots[level] + 1);
1595                 else
1596                         btrfs_node_key_to_cpu(path->nodes[level], key,
1597                                               path->slots[level] + 1);
1598                 return 0;
1599         }
1600         return 1;
1601 }
1602
1603 /*
1604  * look for inline back ref. if back ref is found, *ref_ret is set
1605  * to the address of inline back ref, and 0 is returned.
1606  *
1607  * if back ref isn't found, *ref_ret is set to the address where it
1608  * should be inserted, and -ENOENT is returned.
1609  *
1610  * if insert is true and there are too many inline back refs, the path
1611  * points to the extent item, and -EAGAIN is returned.
1612  *
1613  * NOTE: inline back refs are ordered in the same way that back ref
1614  *       items in the tree are ordered.
1615  */
1616 static noinline_for_stack
1617 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1618                                  struct btrfs_fs_info *fs_info,
1619                                  struct btrfs_path *path,
1620                                  struct btrfs_extent_inline_ref **ref_ret,
1621                                  u64 bytenr, u64 num_bytes,
1622                                  u64 parent, u64 root_objectid,
1623                                  u64 owner, u64 offset, int insert)
1624 {
1625         struct btrfs_root *root = fs_info->extent_root;
1626         struct btrfs_key key;
1627         struct extent_buffer *leaf;
1628         struct btrfs_extent_item *ei;
1629         struct btrfs_extent_inline_ref *iref;
1630         u64 flags;
1631         u64 item_size;
1632         unsigned long ptr;
1633         unsigned long end;
1634         int extra_size;
1635         int type;
1636         int want;
1637         int ret;
1638         int err = 0;
1639         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1640         int needed;
1641
1642         key.objectid = bytenr;
1643         key.type = BTRFS_EXTENT_ITEM_KEY;
1644         key.offset = num_bytes;
1645
1646         want = extent_ref_type(parent, owner);
1647         if (insert) {
1648                 extra_size = btrfs_extent_inline_ref_size(want);
1649                 path->keep_locks = 1;
1650         } else
1651                 extra_size = -1;
1652
1653         /*
1654          * Owner is our parent level, so we can just add one to get the level
1655          * for the block we are interested in.
1656          */
1657         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1658                 key.type = BTRFS_METADATA_ITEM_KEY;
1659                 key.offset = owner;
1660         }
1661
1662 again:
1663         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1664         if (ret < 0) {
1665                 err = ret;
1666                 goto out;
1667         }
1668
1669         /*
1670          * We may be a newly converted file system which still has the old fat
1671          * extent entries for metadata, so try and see if we have one of those.
1672          */
1673         if (ret > 0 && skinny_metadata) {
1674                 skinny_metadata = false;
1675                 if (path->slots[0]) {
1676                         path->slots[0]--;
1677                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1678                                               path->slots[0]);
1679                         if (key.objectid == bytenr &&
1680                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1681                             key.offset == num_bytes)
1682                                 ret = 0;
1683                 }
1684                 if (ret) {
1685                         key.objectid = bytenr;
1686                         key.type = BTRFS_EXTENT_ITEM_KEY;
1687                         key.offset = num_bytes;
1688                         btrfs_release_path(path);
1689                         goto again;
1690                 }
1691         }
1692
1693         if (ret && !insert) {
1694                 err = -ENOENT;
1695                 goto out;
1696         } else if (WARN_ON(ret)) {
1697                 err = -EIO;
1698                 goto out;
1699         }
1700
1701         leaf = path->nodes[0];
1702         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1703 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1704         if (item_size < sizeof(*ei)) {
1705                 if (!insert) {
1706                         err = -ENOENT;
1707                         goto out;
1708                 }
1709                 ret = convert_extent_item_v0(trans, fs_info, path, owner,
1710                                              extra_size);
1711                 if (ret < 0) {
1712                         err = ret;
1713                         goto out;
1714                 }
1715                 leaf = path->nodes[0];
1716                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1717         }
1718 #endif
1719         BUG_ON(item_size < sizeof(*ei));
1720
1721         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1722         flags = btrfs_extent_flags(leaf, ei);
1723
1724         ptr = (unsigned long)(ei + 1);
1725         end = (unsigned long)ei + item_size;
1726
1727         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1728                 ptr += sizeof(struct btrfs_tree_block_info);
1729                 BUG_ON(ptr > end);
1730         }
1731
1732         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1733                 needed = BTRFS_REF_TYPE_DATA;
1734         else
1735                 needed = BTRFS_REF_TYPE_BLOCK;
1736
1737         err = -ENOENT;
1738         while (1) {
1739                 if (ptr >= end) {
1740                         WARN_ON(ptr > end);
1741                         break;
1742                 }
1743                 iref = (struct btrfs_extent_inline_ref *)ptr;
1744                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1745                 if (type == BTRFS_REF_TYPE_INVALID) {
1746                         err = -EINVAL;
1747                         goto out;
1748                 }
1749
1750                 if (want < type)
1751                         break;
1752                 if (want > type) {
1753                         ptr += btrfs_extent_inline_ref_size(type);
1754                         continue;
1755                 }
1756
1757                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1758                         struct btrfs_extent_data_ref *dref;
1759                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1760                         if (match_extent_data_ref(leaf, dref, root_objectid,
1761                                                   owner, offset)) {
1762                                 err = 0;
1763                                 break;
1764                         }
1765                         if (hash_extent_data_ref_item(leaf, dref) <
1766                             hash_extent_data_ref(root_objectid, owner, offset))
1767                                 break;
1768                 } else {
1769                         u64 ref_offset;
1770                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1771                         if (parent > 0) {
1772                                 if (parent == ref_offset) {
1773                                         err = 0;
1774                                         break;
1775                                 }
1776                                 if (ref_offset < parent)
1777                                         break;
1778                         } else {
1779                                 if (root_objectid == ref_offset) {
1780                                         err = 0;
1781                                         break;
1782                                 }
1783                                 if (ref_offset < root_objectid)
1784                                         break;
1785                         }
1786                 }
1787                 ptr += btrfs_extent_inline_ref_size(type);
1788         }
1789         if (err == -ENOENT && insert) {
1790                 if (item_size + extra_size >=
1791                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1792                         err = -EAGAIN;
1793                         goto out;
1794                 }
1795                 /*
1796                  * To add new inline back ref, we have to make sure
1797                  * there is no corresponding back ref item.
1798                  * For simplicity, we just do not add new inline back
1799                  * ref if there is any kind of item for this block
1800                  */
1801                 if (find_next_key(path, 0, &key) == 0 &&
1802                     key.objectid == bytenr &&
1803                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1804                         err = -EAGAIN;
1805                         goto out;
1806                 }
1807         }
1808         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1809 out:
1810         if (insert) {
1811                 path->keep_locks = 0;
1812                 btrfs_unlock_up_safe(path, 1);
1813         }
1814         return err;
1815 }
1816
1817 /*
1818  * helper to add new inline back ref
1819  */
1820 static noinline_for_stack
1821 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1822                                  struct btrfs_path *path,
1823                                  struct btrfs_extent_inline_ref *iref,
1824                                  u64 parent, u64 root_objectid,
1825                                  u64 owner, u64 offset, int refs_to_add,
1826                                  struct btrfs_delayed_extent_op *extent_op)
1827 {
1828         struct extent_buffer *leaf;
1829         struct btrfs_extent_item *ei;
1830         unsigned long ptr;
1831         unsigned long end;
1832         unsigned long item_offset;
1833         u64 refs;
1834         int size;
1835         int type;
1836
1837         leaf = path->nodes[0];
1838         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1839         item_offset = (unsigned long)iref - (unsigned long)ei;
1840
1841         type = extent_ref_type(parent, owner);
1842         size = btrfs_extent_inline_ref_size(type);
1843
1844         btrfs_extend_item(fs_info, path, size);
1845
1846         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1847         refs = btrfs_extent_refs(leaf, ei);
1848         refs += refs_to_add;
1849         btrfs_set_extent_refs(leaf, ei, refs);
1850         if (extent_op)
1851                 __run_delayed_extent_op(extent_op, leaf, ei);
1852
1853         ptr = (unsigned long)ei + item_offset;
1854         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1855         if (ptr < end - size)
1856                 memmove_extent_buffer(leaf, ptr + size, ptr,
1857                                       end - size - ptr);
1858
1859         iref = (struct btrfs_extent_inline_ref *)ptr;
1860         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1861         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1862                 struct btrfs_extent_data_ref *dref;
1863                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1864                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1865                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1866                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1867                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1868         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1869                 struct btrfs_shared_data_ref *sref;
1870                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1871                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1872                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1873         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1874                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1875         } else {
1876                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1877         }
1878         btrfs_mark_buffer_dirty(leaf);
1879 }
1880
1881 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1882                                  struct btrfs_fs_info *fs_info,
1883                                  struct btrfs_path *path,
1884                                  struct btrfs_extent_inline_ref **ref_ret,
1885                                  u64 bytenr, u64 num_bytes, u64 parent,
1886                                  u64 root_objectid, u64 owner, u64 offset)
1887 {
1888         int ret;
1889
1890         ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
1891                                            bytenr, num_bytes, parent,
1892                                            root_objectid, owner, offset, 0);
1893         if (ret != -ENOENT)
1894                 return ret;
1895
1896         btrfs_release_path(path);
1897         *ref_ret = NULL;
1898
1899         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1900                 ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1901                                             parent, root_objectid);
1902         } else {
1903                 ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1904                                              parent, root_objectid, owner,
1905                                              offset);
1906         }
1907         return ret;
1908 }
1909
1910 /*
1911  * helper to update/remove inline back ref
1912  */
1913 static noinline_for_stack
1914 void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
1915                                   struct btrfs_path *path,
1916                                   struct btrfs_extent_inline_ref *iref,
1917                                   int refs_to_mod,
1918                                   struct btrfs_delayed_extent_op *extent_op,
1919                                   int *last_ref)
1920 {
1921         struct extent_buffer *leaf;
1922         struct btrfs_extent_item *ei;
1923         struct btrfs_extent_data_ref *dref = NULL;
1924         struct btrfs_shared_data_ref *sref = NULL;
1925         unsigned long ptr;
1926         unsigned long end;
1927         u32 item_size;
1928         int size;
1929         int type;
1930         u64 refs;
1931
1932         leaf = path->nodes[0];
1933         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1934         refs = btrfs_extent_refs(leaf, ei);
1935         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1936         refs += refs_to_mod;
1937         btrfs_set_extent_refs(leaf, ei, refs);
1938         if (extent_op)
1939                 __run_delayed_extent_op(extent_op, leaf, ei);
1940
1941         /*
1942          * If type is invalid, we should have bailed out after
1943          * lookup_inline_extent_backref().
1944          */
1945         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1946         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1947
1948         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1949                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1950                 refs = btrfs_extent_data_ref_count(leaf, dref);
1951         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1952                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1953                 refs = btrfs_shared_data_ref_count(leaf, sref);
1954         } else {
1955                 refs = 1;
1956                 BUG_ON(refs_to_mod != -1);
1957         }
1958
1959         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1960         refs += refs_to_mod;
1961
1962         if (refs > 0) {
1963                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1964                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1965                 else
1966                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1967         } else {
1968                 *last_ref = 1;
1969                 size =  btrfs_extent_inline_ref_size(type);
1970                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1971                 ptr = (unsigned long)iref;
1972                 end = (unsigned long)ei + item_size;
1973                 if (ptr + size < end)
1974                         memmove_extent_buffer(leaf, ptr, ptr + size,
1975                                               end - ptr - size);
1976                 item_size -= size;
1977                 btrfs_truncate_item(fs_info, path, item_size, 1);
1978         }
1979         btrfs_mark_buffer_dirty(leaf);
1980 }
1981
1982 static noinline_for_stack
1983 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1984                                  struct btrfs_fs_info *fs_info,
1985                                  struct btrfs_path *path,
1986                                  u64 bytenr, u64 num_bytes, u64 parent,
1987                                  u64 root_objectid, u64 owner,
1988                                  u64 offset, int refs_to_add,
1989                                  struct btrfs_delayed_extent_op *extent_op)
1990 {
1991         struct btrfs_extent_inline_ref *iref;
1992         int ret;
1993
1994         ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
1995                                            bytenr, num_bytes, parent,
1996                                            root_objectid, owner, offset, 1);
1997         if (ret == 0) {
1998                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1999                 update_inline_extent_backref(fs_info, path, iref,
2000                                              refs_to_add, extent_op, NULL);
2001         } else if (ret == -ENOENT) {
2002                 setup_inline_extent_backref(fs_info, path, iref, parent,
2003                                             root_objectid, owner, offset,
2004                                             refs_to_add, extent_op);
2005                 ret = 0;
2006         }
2007         return ret;
2008 }
2009
2010 static int insert_extent_backref(struct btrfs_trans_handle *trans,
2011                                  struct btrfs_fs_info *fs_info,
2012                                  struct btrfs_path *path,
2013                                  u64 bytenr, u64 parent, u64 root_objectid,
2014                                  u64 owner, u64 offset, int refs_to_add)
2015 {
2016         int ret;
2017         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2018                 BUG_ON(refs_to_add != 1);
2019                 ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
2020                                             parent, root_objectid);
2021         } else {
2022                 ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
2023                                              parent, root_objectid,
2024                                              owner, offset, refs_to_add);
2025         }
2026         return ret;
2027 }
2028
2029 static int remove_extent_backref(struct btrfs_trans_handle *trans,
2030                                  struct btrfs_fs_info *fs_info,
2031                                  struct btrfs_path *path,
2032                                  struct btrfs_extent_inline_ref *iref,
2033                                  int refs_to_drop, int is_data, int *last_ref)
2034 {
2035         int ret = 0;
2036
2037         BUG_ON(!is_data && refs_to_drop != 1);
2038         if (iref) {
2039                 update_inline_extent_backref(fs_info, path, iref,
2040                                              -refs_to_drop, NULL, last_ref);
2041         } else if (is_data) {
2042                 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
2043                                              last_ref);
2044         } else {
2045                 *last_ref = 1;
2046                 ret = btrfs_del_item(trans, fs_info->extent_root, path);
2047         }
2048         return ret;
2049 }
2050
2051 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
2052 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
2053                                u64 *discarded_bytes)
2054 {
2055         int j, ret = 0;
2056         u64 bytes_left, end;
2057         u64 aligned_start = ALIGN(start, 1 << 9);
2058
2059         if (WARN_ON(start != aligned_start)) {
2060                 len -= aligned_start - start;
2061                 len = round_down(len, 1 << 9);
2062                 start = aligned_start;
2063         }
2064
2065         *discarded_bytes = 0;
2066
2067         if (!len)
2068                 return 0;
2069
2070         end = start + len;
2071         bytes_left = len;
2072
2073         /* Skip any superblocks on this device. */
2074         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
2075                 u64 sb_start = btrfs_sb_offset(j);
2076                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2077                 u64 size = sb_start - start;
2078
2079                 if (!in_range(sb_start, start, bytes_left) &&
2080                     !in_range(sb_end, start, bytes_left) &&
2081                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2082                         continue;
2083
2084                 /*
2085                  * Superblock spans beginning of range.  Adjust start and
2086                  * try again.
2087                  */
2088                 if (sb_start <= start) {
2089                         start += sb_end - start;
2090                         if (start > end) {
2091                                 bytes_left = 0;
2092                                 break;
2093                         }
2094                         bytes_left = end - start;
2095                         continue;
2096                 }
2097
2098                 if (size) {
2099                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2100                                                    GFP_NOFS, 0);
2101                         if (!ret)
2102                                 *discarded_bytes += size;
2103                         else if (ret != -EOPNOTSUPP)
2104                                 return ret;
2105                 }
2106
2107                 start = sb_end;
2108                 if (start > end) {
2109                         bytes_left = 0;
2110                         break;
2111                 }
2112                 bytes_left = end - start;
2113         }
2114
2115         if (bytes_left) {
2116                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2117                                            GFP_NOFS, 0);
2118                 if (!ret)
2119                         *discarded_bytes += bytes_left;
2120         }
2121         return ret;
2122 }
2123
2124 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2125                          u64 num_bytes, u64 *actual_bytes)
2126 {
2127         int ret;
2128         u64 discarded_bytes = 0;
2129         struct btrfs_bio *bbio = NULL;
2130
2131
2132         /*
2133          * Avoid races with device replace and make sure our bbio has devices
2134          * associated to its stripes that don't go away while we are discarding.
2135          */
2136         btrfs_bio_counter_inc_blocked(fs_info);
2137         /* Tell the block device(s) that the sectors can be discarded */
2138         ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2139                               &bbio, 0);
2140         /* Error condition is -ENOMEM */
2141         if (!ret) {
2142                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2143                 int i;
2144
2145
2146                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2147                         u64 bytes;
2148                         if (!stripe->dev->can_discard)
2149                                 continue;
2150
2151                         ret = btrfs_issue_discard(stripe->dev->bdev,
2152                                                   stripe->physical,
2153                                                   stripe->length,
2154                                                   &bytes);
2155                         if (!ret)
2156                                 discarded_bytes += bytes;
2157                         else if (ret != -EOPNOTSUPP)
2158                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2159
2160                         /*
2161                          * Just in case we get back EOPNOTSUPP for some reason,
2162                          * just ignore the return value so we don't screw up
2163                          * people calling discard_extent.
2164                          */
2165                         ret = 0;
2166                 }
2167                 btrfs_put_bbio(bbio);
2168         }
2169         btrfs_bio_counter_dec(fs_info);
2170
2171         if (actual_bytes)
2172                 *actual_bytes = discarded_bytes;
2173
2174
2175         if (ret == -EOPNOTSUPP)
2176                 ret = 0;
2177         return ret;
2178 }
2179
2180 /* Can return -ENOMEM */
2181 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2182                          struct btrfs_root *root,
2183                          u64 bytenr, u64 num_bytes, u64 parent,
2184                          u64 root_objectid, u64 owner, u64 offset)
2185 {
2186         struct btrfs_fs_info *fs_info = root->fs_info;
2187         int old_ref_mod, new_ref_mod;
2188         int ret;
2189
2190         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2191                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2192
2193         btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2194                            owner, offset, BTRFS_ADD_DELAYED_REF);
2195
2196         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2197                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2198                                                  num_bytes, parent,
2199                                                  root_objectid, (int)owner,
2200                                                  BTRFS_ADD_DELAYED_REF, NULL,
2201                                                  &old_ref_mod, &new_ref_mod);
2202         } else {
2203                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2204                                                  num_bytes, parent,
2205                                                  root_objectid, owner, offset,
2206                                                  0, BTRFS_ADD_DELAYED_REF,
2207                                                  &old_ref_mod, &new_ref_mod);
2208         }
2209
2210         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2211                 add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
2212
2213         return ret;
2214 }
2215
2216 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2217                                   struct btrfs_fs_info *fs_info,
2218                                   struct btrfs_delayed_ref_node *node,
2219                                   u64 parent, u64 root_objectid,
2220                                   u64 owner, u64 offset, int refs_to_add,
2221                                   struct btrfs_delayed_extent_op *extent_op)
2222 {
2223         struct btrfs_path *path;
2224         struct extent_buffer *leaf;
2225         struct btrfs_extent_item *item;
2226         struct btrfs_key key;
2227         u64 bytenr = node->bytenr;
2228         u64 num_bytes = node->num_bytes;
2229         u64 refs;
2230         int ret;
2231
2232         path = btrfs_alloc_path();
2233         if (!path)
2234                 return -ENOMEM;
2235
2236         path->reada = READA_FORWARD;
2237         path->leave_spinning = 1;
2238         /* this will setup the path even if it fails to insert the back ref */
2239         ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2240                                            num_bytes, parent, root_objectid,
2241                                            owner, offset,
2242                                            refs_to_add, extent_op);
2243         if ((ret < 0 && ret != -EAGAIN) || !ret)
2244                 goto out;
2245
2246         /*
2247          * Ok we had -EAGAIN which means we didn't have space to insert and
2248          * inline extent ref, so just update the reference count and add a
2249          * normal backref.
2250          */
2251         leaf = path->nodes[0];
2252         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2253         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2254         refs = btrfs_extent_refs(leaf, item);
2255         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2256         if (extent_op)
2257                 __run_delayed_extent_op(extent_op, leaf, item);
2258
2259         btrfs_mark_buffer_dirty(leaf);
2260         btrfs_release_path(path);
2261
2262         path->reada = READA_FORWARD;
2263         path->leave_spinning = 1;
2264         /* now insert the actual backref */
2265         ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2266                                     root_objectid, owner, offset, refs_to_add);
2267         if (ret)
2268                 btrfs_abort_transaction(trans, ret);
2269 out:
2270         btrfs_free_path(path);
2271         return ret;
2272 }
2273
2274 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2275                                 struct btrfs_fs_info *fs_info,
2276                                 struct btrfs_delayed_ref_node *node,
2277                                 struct btrfs_delayed_extent_op *extent_op,
2278                                 int insert_reserved)
2279 {
2280         int ret = 0;
2281         struct btrfs_delayed_data_ref *ref;
2282         struct btrfs_key ins;
2283         u64 parent = 0;
2284         u64 ref_root = 0;
2285         u64 flags = 0;
2286
2287         ins.objectid = node->bytenr;
2288         ins.offset = node->num_bytes;
2289         ins.type = BTRFS_EXTENT_ITEM_KEY;
2290
2291         ref = btrfs_delayed_node_to_data_ref(node);
2292         trace_run_delayed_data_ref(fs_info, node, ref, node->action);
2293
2294         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2295                 parent = ref->parent;
2296         ref_root = ref->root;
2297
2298         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2299                 if (extent_op)
2300                         flags |= extent_op->flags_to_set;
2301                 ret = alloc_reserved_file_extent(trans, fs_info,
2302                                                  parent, ref_root, flags,
2303                                                  ref->objectid, ref->offset,
2304                                                  &ins, node->ref_mod);
2305         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2306                 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
2307                                              ref_root, ref->objectid,
2308                                              ref->offset, node->ref_mod,
2309                                              extent_op);
2310         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2311                 ret = __btrfs_free_extent(trans, fs_info, node, parent,
2312                                           ref_root, ref->objectid,
2313                                           ref->offset, node->ref_mod,
2314                                           extent_op);
2315         } else {
2316                 BUG();
2317         }
2318         return ret;
2319 }
2320
2321 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2322                                     struct extent_buffer *leaf,
2323                                     struct btrfs_extent_item *ei)
2324 {
2325         u64 flags = btrfs_extent_flags(leaf, ei);
2326         if (extent_op->update_flags) {
2327                 flags |= extent_op->flags_to_set;
2328                 btrfs_set_extent_flags(leaf, ei, flags);
2329         }
2330
2331         if (extent_op->update_key) {
2332                 struct btrfs_tree_block_info *bi;
2333                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2334                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2335                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2336         }
2337 }
2338
2339 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2340                                  struct btrfs_fs_info *fs_info,
2341                                  struct btrfs_delayed_ref_head *head,
2342                                  struct btrfs_delayed_extent_op *extent_op)
2343 {
2344         struct btrfs_key key;
2345         struct btrfs_path *path;
2346         struct btrfs_extent_item *ei;
2347         struct extent_buffer *leaf;
2348         u32 item_size;
2349         int ret;
2350         int err = 0;
2351         int metadata = !extent_op->is_data;
2352
2353         if (trans->aborted)
2354                 return 0;
2355
2356         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2357                 metadata = 0;
2358
2359         path = btrfs_alloc_path();
2360         if (!path)
2361                 return -ENOMEM;
2362
2363         key.objectid = head->bytenr;
2364
2365         if (metadata) {
2366                 key.type = BTRFS_METADATA_ITEM_KEY;
2367                 key.offset = extent_op->level;
2368         } else {
2369                 key.type = BTRFS_EXTENT_ITEM_KEY;
2370                 key.offset = head->num_bytes;
2371         }
2372
2373 again:
2374         path->reada = READA_FORWARD;
2375         path->leave_spinning = 1;
2376         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2377         if (ret < 0) {
2378                 err = ret;
2379                 goto out;
2380         }
2381         if (ret > 0) {
2382                 if (metadata) {
2383                         if (path->slots[0] > 0) {
2384                                 path->slots[0]--;
2385                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2386                                                       path->slots[0]);
2387                                 if (key.objectid == head->bytenr &&
2388                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2389                                     key.offset == head->num_bytes)
2390                                         ret = 0;
2391                         }
2392                         if (ret > 0) {
2393                                 btrfs_release_path(path);
2394                                 metadata = 0;
2395
2396                                 key.objectid = head->bytenr;
2397                                 key.offset = head->num_bytes;
2398                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2399                                 goto again;
2400                         }
2401                 } else {
2402                         err = -EIO;
2403                         goto out;
2404                 }
2405         }
2406
2407         leaf = path->nodes[0];
2408         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2409 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2410         if (item_size < sizeof(*ei)) {
2411                 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
2412                 if (ret < 0) {
2413                         err = ret;
2414                         goto out;
2415                 }
2416                 leaf = path->nodes[0];
2417                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2418         }
2419 #endif
2420         BUG_ON(item_size < sizeof(*ei));
2421         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2422         __run_delayed_extent_op(extent_op, leaf, ei);
2423
2424         btrfs_mark_buffer_dirty(leaf);
2425 out:
2426         btrfs_free_path(path);
2427         return err;
2428 }
2429
2430 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2431                                 struct btrfs_fs_info *fs_info,
2432                                 struct btrfs_delayed_ref_node *node,
2433                                 struct btrfs_delayed_extent_op *extent_op,
2434                                 int insert_reserved)
2435 {
2436         int ret = 0;
2437         struct btrfs_delayed_tree_ref *ref;
2438         struct btrfs_key ins;
2439         u64 parent = 0;
2440         u64 ref_root = 0;
2441         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2442
2443         ref = btrfs_delayed_node_to_tree_ref(node);
2444         trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
2445
2446         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2447                 parent = ref->parent;
2448         ref_root = ref->root;
2449
2450         ins.objectid = node->bytenr;
2451         if (skinny_metadata) {
2452                 ins.offset = ref->level;
2453                 ins.type = BTRFS_METADATA_ITEM_KEY;
2454         } else {
2455                 ins.offset = node->num_bytes;
2456                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2457         }
2458
2459         if (node->ref_mod != 1) {
2460                 btrfs_err(fs_info,
2461         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2462                           node->bytenr, node->ref_mod, node->action, ref_root,
2463                           parent);
2464                 return -EIO;
2465         }
2466         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2467                 BUG_ON(!extent_op || !extent_op->update_flags);
2468                 ret = alloc_reserved_tree_block(trans, fs_info,
2469                                                 parent, ref_root,
2470                                                 extent_op->flags_to_set,
2471                                                 &extent_op->key,
2472                                                 ref->level, &ins);
2473         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2474                 ret = __btrfs_inc_extent_ref(trans, fs_info, node,
2475                                              parent, ref_root,
2476                                              ref->level, 0, 1,
2477                                              extent_op);
2478         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2479                 ret = __btrfs_free_extent(trans, fs_info, node,
2480                                           parent, ref_root,
2481                                           ref->level, 0, 1, extent_op);
2482         } else {
2483                 BUG();
2484         }
2485         return ret;
2486 }
2487
2488 /* helper function to actually process a single delayed ref entry */
2489 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2490                                struct btrfs_fs_info *fs_info,
2491                                struct btrfs_delayed_ref_node *node,
2492                                struct btrfs_delayed_extent_op *extent_op,
2493                                int insert_reserved)
2494 {
2495         int ret = 0;
2496
2497         if (trans->aborted) {
2498                 if (insert_reserved)
2499                         btrfs_pin_extent(fs_info, node->bytenr,
2500                                          node->num_bytes, 1);
2501                 return 0;
2502         }
2503
2504         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2505             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2506                 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
2507                                            insert_reserved);
2508         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2509                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2510                 ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
2511                                            insert_reserved);
2512         else
2513                 BUG();
2514         return ret;
2515 }
2516
2517 static inline struct btrfs_delayed_ref_node *
2518 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2519 {
2520         struct btrfs_delayed_ref_node *ref;
2521
2522         if (RB_EMPTY_ROOT(&head->ref_tree))
2523                 return NULL;
2524
2525         /*
2526          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2527          * This is to prevent a ref count from going down to zero, which deletes
2528          * the extent item from the extent tree, when there still are references
2529          * to add, which would fail because they would not find the extent item.
2530          */
2531         if (!list_empty(&head->ref_add_list))
2532                 return list_first_entry(&head->ref_add_list,
2533                                 struct btrfs_delayed_ref_node, add_list);
2534
2535         ref = rb_entry(rb_first(&head->ref_tree),
2536                        struct btrfs_delayed_ref_node, ref_node);
2537         ASSERT(list_empty(&ref->add_list));
2538         return ref;
2539 }
2540
2541 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2542                                       struct btrfs_delayed_ref_head *head)
2543 {
2544         spin_lock(&delayed_refs->lock);
2545         head->processing = 0;
2546         delayed_refs->num_heads_ready++;
2547         spin_unlock(&delayed_refs->lock);
2548         btrfs_delayed_ref_unlock(head);
2549 }
2550
2551 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2552                              struct btrfs_fs_info *fs_info,
2553                              struct btrfs_delayed_ref_head *head)
2554 {
2555         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2556         int ret;
2557
2558         if (!extent_op)
2559                 return 0;
2560         head->extent_op = NULL;
2561         if (head->must_insert_reserved) {
2562                 btrfs_free_delayed_extent_op(extent_op);
2563                 return 0;
2564         }
2565         spin_unlock(&head->lock);
2566         ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
2567         btrfs_free_delayed_extent_op(extent_op);
2568         return ret ? ret : 1;
2569 }
2570
2571 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2572                             struct btrfs_fs_info *fs_info,
2573                             struct btrfs_delayed_ref_head *head)
2574 {
2575         struct btrfs_delayed_ref_root *delayed_refs;
2576         int ret;
2577
2578         delayed_refs = &trans->transaction->delayed_refs;
2579
2580         ret = cleanup_extent_op(trans, fs_info, head);
2581         if (ret < 0) {
2582                 unselect_delayed_ref_head(delayed_refs, head);
2583                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2584                 return ret;
2585         } else if (ret) {
2586                 return ret;
2587         }
2588
2589         /*
2590          * Need to drop our head ref lock and re-acquire the delayed ref lock
2591          * and then re-check to make sure nobody got added.
2592          */
2593         spin_unlock(&head->lock);
2594         spin_lock(&delayed_refs->lock);
2595         spin_lock(&head->lock);
2596         if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
2597                 spin_unlock(&head->lock);
2598                 spin_unlock(&delayed_refs->lock);
2599                 return 1;
2600         }
2601         delayed_refs->num_heads--;
2602         rb_erase(&head->href_node, &delayed_refs->href_root);
2603         RB_CLEAR_NODE(&head->href_node);
2604         spin_unlock(&delayed_refs->lock);
2605         spin_unlock(&head->lock);
2606         atomic_dec(&delayed_refs->num_entries);
2607
2608         trace_run_delayed_ref_head(fs_info, head, 0);
2609
2610         if (head->total_ref_mod < 0) {
2611                 struct btrfs_block_group_cache *cache;
2612
2613                 cache = btrfs_lookup_block_group(fs_info, head->bytenr);
2614                 ASSERT(cache);
2615                 percpu_counter_add(&cache->space_info->total_bytes_pinned,
2616                                    -head->num_bytes);
2617                 btrfs_put_block_group(cache);
2618
2619                 if (head->is_data) {
2620                         spin_lock(&delayed_refs->lock);
2621                         delayed_refs->pending_csums -= head->num_bytes;
2622                         spin_unlock(&delayed_refs->lock);
2623                 }
2624         }
2625
2626         if (head->must_insert_reserved) {
2627                 btrfs_pin_extent(fs_info, head->bytenr,
2628                                  head->num_bytes, 1);
2629                 if (head->is_data) {
2630                         ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2631                                               head->num_bytes);
2632                 }
2633         }
2634
2635         /* Also free its reserved qgroup space */
2636         btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2637                                       head->qgroup_reserved);
2638         btrfs_delayed_ref_unlock(head);
2639         btrfs_put_delayed_ref_head(head);
2640         return 0;
2641 }
2642
2643 /*
2644  * Returns 0 on success or if called with an already aborted transaction.
2645  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2646  */
2647 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2648                                              struct btrfs_fs_info *fs_info,
2649                                              unsigned long nr)
2650 {
2651         struct btrfs_delayed_ref_root *delayed_refs;
2652         struct btrfs_delayed_ref_node *ref;
2653         struct btrfs_delayed_ref_head *locked_ref = NULL;
2654         struct btrfs_delayed_extent_op *extent_op;
2655         ktime_t start = ktime_get();
2656         int ret;
2657         unsigned long count = 0;
2658         unsigned long actual_count = 0;
2659         int must_insert_reserved = 0;
2660
2661         delayed_refs = &trans->transaction->delayed_refs;
2662         while (1) {
2663                 if (!locked_ref) {
2664                         if (count >= nr)
2665                                 break;
2666
2667                         spin_lock(&delayed_refs->lock);
2668                         locked_ref = btrfs_select_ref_head(trans);
2669                         if (!locked_ref) {
2670                                 spin_unlock(&delayed_refs->lock);
2671                                 break;
2672                         }
2673
2674                         /* grab the lock that says we are going to process
2675                          * all the refs for this head */
2676                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2677                         spin_unlock(&delayed_refs->lock);
2678                         /*
2679                          * we may have dropped the spin lock to get the head
2680                          * mutex lock, and that might have given someone else
2681                          * time to free the head.  If that's true, it has been
2682                          * removed from our list and we can move on.
2683                          */
2684                         if (ret == -EAGAIN) {
2685                                 locked_ref = NULL;
2686                                 count++;
2687                                 continue;
2688                         }
2689                 }
2690
2691                 /*
2692                  * We need to try and merge add/drops of the same ref since we
2693                  * can run into issues with relocate dropping the implicit ref
2694                  * and then it being added back again before the drop can
2695                  * finish.  If we merged anything we need to re-loop so we can
2696                  * get a good ref.
2697                  * Or we can get node references of the same type that weren't
2698                  * merged when created due to bumps in the tree mod seq, and
2699                  * we need to merge them to prevent adding an inline extent
2700                  * backref before dropping it (triggering a BUG_ON at
2701                  * insert_inline_extent_backref()).
2702                  */
2703                 spin_lock(&locked_ref->lock);
2704                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2705                                          locked_ref);
2706
2707                 /*
2708                  * locked_ref is the head node, so we have to go one
2709                  * node back for any delayed ref updates
2710                  */
2711                 ref = select_delayed_ref(locked_ref);
2712
2713                 if (ref && ref->seq &&
2714                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2715                         spin_unlock(&locked_ref->lock);
2716                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2717                         locked_ref = NULL;
2718                         cond_resched();
2719                         count++;
2720                         continue;
2721                 }
2722
2723                 /*
2724                  * We're done processing refs in this ref_head, clean everything
2725                  * up and move on to the next ref_head.
2726                  */
2727                 if (!ref) {
2728                         ret = cleanup_ref_head(trans, fs_info, locked_ref);
2729                         if (ret > 0 ) {
2730                                 /* We dropped our lock, we need to loop. */
2731                                 ret = 0;
2732                                 continue;
2733                         } else if (ret) {
2734                                 return ret;
2735                         }
2736                         locked_ref = NULL;
2737                         count++;
2738                         continue;
2739                 }
2740
2741                 actual_count++;
2742                 ref->in_tree = 0;
2743                 rb_erase(&ref->ref_node, &locked_ref->ref_tree);
2744                 RB_CLEAR_NODE(&ref->ref_node);
2745                 if (!list_empty(&ref->add_list))
2746                         list_del(&ref->add_list);
2747                 /*
2748                  * When we play the delayed ref, also correct the ref_mod on
2749                  * head
2750                  */
2751                 switch (ref->action) {
2752                 case BTRFS_ADD_DELAYED_REF:
2753                 case BTRFS_ADD_DELAYED_EXTENT:
2754                         locked_ref->ref_mod -= ref->ref_mod;
2755                         break;
2756                 case BTRFS_DROP_DELAYED_REF:
2757                         locked_ref->ref_mod += ref->ref_mod;
2758                         break;
2759                 default:
2760                         WARN_ON(1);
2761                 }
2762                 atomic_dec(&delayed_refs->num_entries);
2763
2764                 /*
2765                  * Record the must-insert_reserved flag before we drop the spin
2766                  * lock.
2767                  */
2768                 must_insert_reserved = locked_ref->must_insert_reserved;
2769                 locked_ref->must_insert_reserved = 0;
2770
2771                 extent_op = locked_ref->extent_op;
2772                 locked_ref->extent_op = NULL;
2773                 spin_unlock(&locked_ref->lock);
2774
2775                 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
2776                                           must_insert_reserved);
2777
2778                 btrfs_free_delayed_extent_op(extent_op);
2779                 if (ret) {
2780                         unselect_delayed_ref_head(delayed_refs, locked_ref);
2781                         btrfs_put_delayed_ref(ref);
2782                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2783                                     ret);
2784                         return ret;
2785                 }
2786
2787                 btrfs_put_delayed_ref(ref);
2788                 count++;
2789                 cond_resched();
2790         }
2791
2792         /*
2793          * We don't want to include ref heads since we can have empty ref heads
2794          * and those will drastically skew our runtime down since we just do
2795          * accounting, no actual extent tree updates.
2796          */
2797         if (actual_count > 0) {
2798                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2799                 u64 avg;
2800
2801                 /*
2802                  * We weigh the current average higher than our current runtime
2803                  * to avoid large swings in the average.
2804                  */
2805                 spin_lock(&delayed_refs->lock);
2806                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2807                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2808                 spin_unlock(&delayed_refs->lock);
2809         }
2810         return 0;
2811 }
2812
2813 #ifdef SCRAMBLE_DELAYED_REFS
2814 /*
2815  * Normally delayed refs get processed in ascending bytenr order. This
2816  * correlates in most cases to the order added. To expose dependencies on this
2817  * order, we start to process the tree in the middle instead of the beginning
2818  */
2819 static u64 find_middle(struct rb_root *root)
2820 {
2821         struct rb_node *n = root->rb_node;
2822         struct btrfs_delayed_ref_node *entry;
2823         int alt = 1;
2824         u64 middle;
2825         u64 first = 0, last = 0;
2826
2827         n = rb_first(root);
2828         if (n) {
2829                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2830                 first = entry->bytenr;
2831         }
2832         n = rb_last(root);
2833         if (n) {
2834                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2835                 last = entry->bytenr;
2836         }
2837         n = root->rb_node;
2838
2839         while (n) {
2840                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2841                 WARN_ON(!entry->in_tree);
2842
2843                 middle = entry->bytenr;
2844
2845                 if (alt)
2846                         n = n->rb_left;
2847                 else
2848                         n = n->rb_right;
2849
2850                 alt = 1 - alt;
2851         }
2852         return middle;
2853 }
2854 #endif
2855
2856 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2857 {
2858         u64 num_bytes;
2859
2860         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2861                              sizeof(struct btrfs_extent_inline_ref));
2862         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2863                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2864
2865         /*
2866          * We don't ever fill up leaves all the way so multiply by 2 just to be
2867          * closer to what we're really going to want to use.
2868          */
2869         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2870 }
2871
2872 /*
2873  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2874  * would require to store the csums for that many bytes.
2875  */
2876 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2877 {
2878         u64 csum_size;
2879         u64 num_csums_per_leaf;
2880         u64 num_csums;
2881
2882         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2883         num_csums_per_leaf = div64_u64(csum_size,
2884                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2885         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2886         num_csums += num_csums_per_leaf - 1;
2887         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2888         return num_csums;
2889 }
2890
2891 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2892                                        struct btrfs_fs_info *fs_info)
2893 {
2894         struct btrfs_block_rsv *global_rsv;
2895         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2896         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2897         unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2898         u64 num_bytes, num_dirty_bgs_bytes;
2899         int ret = 0;
2900
2901         num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2902         num_heads = heads_to_leaves(fs_info, num_heads);
2903         if (num_heads > 1)
2904                 num_bytes += (num_heads - 1) * fs_info->nodesize;
2905         num_bytes <<= 1;
2906         num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2907                                                         fs_info->nodesize;
2908         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2909                                                              num_dirty_bgs);
2910         global_rsv = &fs_info->global_block_rsv;
2911
2912         /*
2913          * If we can't allocate any more chunks lets make sure we have _lots_ of
2914          * wiggle room since running delayed refs can create more delayed refs.
2915          */
2916         if (global_rsv->space_info->full) {
2917                 num_dirty_bgs_bytes <<= 1;
2918                 num_bytes <<= 1;
2919         }
2920
2921         spin_lock(&global_rsv->lock);
2922         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2923                 ret = 1;
2924         spin_unlock(&global_rsv->lock);
2925         return ret;
2926 }
2927
2928 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2929                                        struct btrfs_fs_info *fs_info)
2930 {
2931         u64 num_entries =
2932                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2933         u64 avg_runtime;
2934         u64 val;
2935
2936         smp_mb();
2937         avg_runtime = fs_info->avg_delayed_ref_runtime;
2938         val = num_entries * avg_runtime;
2939         if (val >= NSEC_PER_SEC)
2940                 return 1;
2941         if (val >= NSEC_PER_SEC / 2)
2942                 return 2;
2943
2944         return btrfs_check_space_for_delayed_refs(trans, fs_info);
2945 }
2946
2947 struct async_delayed_refs {
2948         struct btrfs_root *root;
2949         u64 transid;
2950         int count;
2951         int error;
2952         int sync;
2953         struct completion wait;
2954         struct btrfs_work work;
2955 };
2956
2957 static inline struct async_delayed_refs *
2958 to_async_delayed_refs(struct btrfs_work *work)
2959 {
2960         return container_of(work, struct async_delayed_refs, work);
2961 }
2962
2963 static void delayed_ref_async_start(struct btrfs_work *work)
2964 {
2965         struct async_delayed_refs *async = to_async_delayed_refs(work);
2966         struct btrfs_trans_handle *trans;
2967         struct btrfs_fs_info *fs_info = async->root->fs_info;
2968         int ret;
2969
2970         /* if the commit is already started, we don't need to wait here */
2971         if (btrfs_transaction_blocked(fs_info))
2972                 goto done;
2973
2974         trans = btrfs_join_transaction(async->root);
2975         if (IS_ERR(trans)) {
2976                 async->error = PTR_ERR(trans);
2977                 goto done;
2978         }
2979
2980         /*
2981          * trans->sync means that when we call end_transaction, we won't
2982          * wait on delayed refs
2983          */
2984         trans->sync = true;
2985
2986         /* Don't bother flushing if we got into a different transaction */
2987         if (trans->transid > async->transid)
2988                 goto end;
2989
2990         ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
2991         if (ret)
2992                 async->error = ret;
2993 end:
2994         ret = btrfs_end_transaction(trans);
2995         if (ret && !async->error)
2996                 async->error = ret;
2997 done:
2998         if (async->sync)
2999                 complete(&async->wait);
3000         else
3001                 kfree(async);
3002 }
3003
3004 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
3005                                  unsigned long count, u64 transid, int wait)
3006 {
3007         struct async_delayed_refs *async;
3008         int ret;
3009
3010         async = kmalloc(sizeof(*async), GFP_NOFS);
3011         if (!async)
3012                 return -ENOMEM;
3013
3014         async->root = fs_info->tree_root;
3015         async->count = count;
3016         async->error = 0;
3017         async->transid = transid;
3018         if (wait)
3019                 async->sync = 1;
3020         else
3021                 async->sync = 0;
3022         init_completion(&async->wait);
3023
3024         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
3025                         delayed_ref_async_start, NULL, NULL);
3026
3027         btrfs_queue_work(fs_info->extent_workers, &async->work);
3028
3029         if (wait) {
3030                 wait_for_completion(&async->wait);
3031                 ret = async->error;
3032                 kfree(async);
3033                 return ret;
3034         }
3035         return 0;
3036 }
3037
3038 /*
3039  * this starts processing the delayed reference count updates and
3040  * extent insertions we have queued up so far.  count can be
3041  * 0, which means to process everything in the tree at the start
3042  * of the run (but not newly added entries), or it can be some target
3043  * number you'd like to process.
3044  *
3045  * Returns 0 on success or if called with an aborted transaction
3046  * Returns <0 on error and aborts the transaction
3047  */
3048 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3049                            struct btrfs_fs_info *fs_info, unsigned long count)
3050 {
3051         struct rb_node *node;
3052         struct btrfs_delayed_ref_root *delayed_refs;
3053         struct btrfs_delayed_ref_head *head;
3054         int ret;
3055         int run_all = count == (unsigned long)-1;
3056         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
3057
3058         /* We'll clean this up in btrfs_cleanup_transaction */
3059         if (trans->aborted)