6dfa92e5ceed239a7e1928ad55dc3f31eb50fdf5
[sfrench/cifs-2.6.git] / fs / btrfs / relocation.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include "ctree.h"
13 #include "disk-io.h"
14 #include "transaction.h"
15 #include "volumes.h"
16 #include "locking.h"
17 #include "btrfs_inode.h"
18 #include "async-thread.h"
19 #include "free-space-cache.h"
20 #include "inode-map.h"
21 #include "qgroup.h"
22 #include "print-tree.h"
23
24 /*
25  * backref_node, mapping_node and tree_block start with this
26  */
27 struct tree_entry {
28         struct rb_node rb_node;
29         u64 bytenr;
30 };
31
32 /*
33  * present a tree block in the backref cache
34  */
35 struct backref_node {
36         struct rb_node rb_node;
37         u64 bytenr;
38
39         u64 new_bytenr;
40         /* objectid of tree block owner, can be not uptodate */
41         u64 owner;
42         /* link to pending, changed or detached list */
43         struct list_head list;
44         /* list of upper level blocks reference this block */
45         struct list_head upper;
46         /* list of child blocks in the cache */
47         struct list_head lower;
48         /* NULL if this node is not tree root */
49         struct btrfs_root *root;
50         /* extent buffer got by COW the block */
51         struct extent_buffer *eb;
52         /* level of tree block */
53         unsigned int level:8;
54         /* is the block in non-reference counted tree */
55         unsigned int cowonly:1;
56         /* 1 if no child node in the cache */
57         unsigned int lowest:1;
58         /* is the extent buffer locked */
59         unsigned int locked:1;
60         /* has the block been processed */
61         unsigned int processed:1;
62         /* have backrefs of this block been checked */
63         unsigned int checked:1;
64         /*
65          * 1 if corresponding block has been cowed but some upper
66          * level block pointers may not point to the new location
67          */
68         unsigned int pending:1;
69         /*
70          * 1 if the backref node isn't connected to any other
71          * backref node.
72          */
73         unsigned int detached:1;
74 };
75
76 /*
77  * present a block pointer in the backref cache
78  */
79 struct backref_edge {
80         struct list_head list[2];
81         struct backref_node *node[2];
82 };
83
84 #define LOWER   0
85 #define UPPER   1
86 #define RELOCATION_RESERVED_NODES       256
87
88 struct backref_cache {
89         /* red black tree of all backref nodes in the cache */
90         struct rb_root rb_root;
91         /* for passing backref nodes to btrfs_reloc_cow_block */
92         struct backref_node *path[BTRFS_MAX_LEVEL];
93         /*
94          * list of blocks that have been cowed but some block
95          * pointers in upper level blocks may not reflect the
96          * new location
97          */
98         struct list_head pending[BTRFS_MAX_LEVEL];
99         /* list of backref nodes with no child node */
100         struct list_head leaves;
101         /* list of blocks that have been cowed in current transaction */
102         struct list_head changed;
103         /* list of detached backref node. */
104         struct list_head detached;
105
106         u64 last_trans;
107
108         int nr_nodes;
109         int nr_edges;
110 };
111
112 /*
113  * map address of tree root to tree
114  */
115 struct mapping_node {
116         struct rb_node rb_node;
117         u64 bytenr;
118         void *data;
119 };
120
121 struct mapping_tree {
122         struct rb_root rb_root;
123         spinlock_t lock;
124 };
125
126 /*
127  * present a tree block to process
128  */
129 struct tree_block {
130         struct rb_node rb_node;
131         u64 bytenr;
132         struct btrfs_key key;
133         unsigned int level:8;
134         unsigned int key_ready:1;
135 };
136
137 #define MAX_EXTENTS 128
138
139 struct file_extent_cluster {
140         u64 start;
141         u64 end;
142         u64 boundary[MAX_EXTENTS];
143         unsigned int nr;
144 };
145
146 struct reloc_control {
147         /* block group to relocate */
148         struct btrfs_block_group_cache *block_group;
149         /* extent tree */
150         struct btrfs_root *extent_root;
151         /* inode for moving data */
152         struct inode *data_inode;
153
154         struct btrfs_block_rsv *block_rsv;
155
156         struct backref_cache backref_cache;
157
158         struct file_extent_cluster cluster;
159         /* tree blocks have been processed */
160         struct extent_io_tree processed_blocks;
161         /* map start of tree root to corresponding reloc tree */
162         struct mapping_tree reloc_root_tree;
163         /* list of reloc trees */
164         struct list_head reloc_roots;
165         /* size of metadata reservation for merging reloc trees */
166         u64 merging_rsv_size;
167         /* size of relocated tree nodes */
168         u64 nodes_relocated;
169         /* reserved size for block group relocation*/
170         u64 reserved_bytes;
171
172         u64 search_start;
173         u64 extents_found;
174
175         unsigned int stage:8;
176         unsigned int create_reloc_tree:1;
177         unsigned int merge_reloc_tree:1;
178         unsigned int found_file_extent:1;
179 };
180
181 /* stages of data relocation */
182 #define MOVE_DATA_EXTENTS       0
183 #define UPDATE_DATA_PTRS        1
184
185 static void remove_backref_node(struct backref_cache *cache,
186                                 struct backref_node *node);
187 static void __mark_block_processed(struct reloc_control *rc,
188                                    struct backref_node *node);
189
190 static void mapping_tree_init(struct mapping_tree *tree)
191 {
192         tree->rb_root = RB_ROOT;
193         spin_lock_init(&tree->lock);
194 }
195
196 static void backref_cache_init(struct backref_cache *cache)
197 {
198         int i;
199         cache->rb_root = RB_ROOT;
200         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
201                 INIT_LIST_HEAD(&cache->pending[i]);
202         INIT_LIST_HEAD(&cache->changed);
203         INIT_LIST_HEAD(&cache->detached);
204         INIT_LIST_HEAD(&cache->leaves);
205 }
206
207 static void backref_cache_cleanup(struct backref_cache *cache)
208 {
209         struct backref_node *node;
210         int i;
211
212         while (!list_empty(&cache->detached)) {
213                 node = list_entry(cache->detached.next,
214                                   struct backref_node, list);
215                 remove_backref_node(cache, node);
216         }
217
218         while (!list_empty(&cache->leaves)) {
219                 node = list_entry(cache->leaves.next,
220                                   struct backref_node, lower);
221                 remove_backref_node(cache, node);
222         }
223
224         cache->last_trans = 0;
225
226         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
227                 ASSERT(list_empty(&cache->pending[i]));
228         ASSERT(list_empty(&cache->changed));
229         ASSERT(list_empty(&cache->detached));
230         ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
231         ASSERT(!cache->nr_nodes);
232         ASSERT(!cache->nr_edges);
233 }
234
235 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
236 {
237         struct backref_node *node;
238
239         node = kzalloc(sizeof(*node), GFP_NOFS);
240         if (node) {
241                 INIT_LIST_HEAD(&node->list);
242                 INIT_LIST_HEAD(&node->upper);
243                 INIT_LIST_HEAD(&node->lower);
244                 RB_CLEAR_NODE(&node->rb_node);
245                 cache->nr_nodes++;
246         }
247         return node;
248 }
249
250 static void free_backref_node(struct backref_cache *cache,
251                               struct backref_node *node)
252 {
253         if (node) {
254                 cache->nr_nodes--;
255                 kfree(node);
256         }
257 }
258
259 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
260 {
261         struct backref_edge *edge;
262
263         edge = kzalloc(sizeof(*edge), GFP_NOFS);
264         if (edge)
265                 cache->nr_edges++;
266         return edge;
267 }
268
269 static void free_backref_edge(struct backref_cache *cache,
270                               struct backref_edge *edge)
271 {
272         if (edge) {
273                 cache->nr_edges--;
274                 kfree(edge);
275         }
276 }
277
278 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
279                                    struct rb_node *node)
280 {
281         struct rb_node **p = &root->rb_node;
282         struct rb_node *parent = NULL;
283         struct tree_entry *entry;
284
285         while (*p) {
286                 parent = *p;
287                 entry = rb_entry(parent, struct tree_entry, rb_node);
288
289                 if (bytenr < entry->bytenr)
290                         p = &(*p)->rb_left;
291                 else if (bytenr > entry->bytenr)
292                         p = &(*p)->rb_right;
293                 else
294                         return parent;
295         }
296
297         rb_link_node(node, parent, p);
298         rb_insert_color(node, root);
299         return NULL;
300 }
301
302 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
303 {
304         struct rb_node *n = root->rb_node;
305         struct tree_entry *entry;
306
307         while (n) {
308                 entry = rb_entry(n, struct tree_entry, rb_node);
309
310                 if (bytenr < entry->bytenr)
311                         n = n->rb_left;
312                 else if (bytenr > entry->bytenr)
313                         n = n->rb_right;
314                 else
315                         return n;
316         }
317         return NULL;
318 }
319
320 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
321 {
322
323         struct btrfs_fs_info *fs_info = NULL;
324         struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
325                                               rb_node);
326         if (bnode->root)
327                 fs_info = bnode->root->fs_info;
328         btrfs_panic(fs_info, errno,
329                     "Inconsistency in backref cache found at offset %llu",
330                     bytenr);
331 }
332
333 /*
334  * walk up backref nodes until reach node presents tree root
335  */
336 static struct backref_node *walk_up_backref(struct backref_node *node,
337                                             struct backref_edge *edges[],
338                                             int *index)
339 {
340         struct backref_edge *edge;
341         int idx = *index;
342
343         while (!list_empty(&node->upper)) {
344                 edge = list_entry(node->upper.next,
345                                   struct backref_edge, list[LOWER]);
346                 edges[idx++] = edge;
347                 node = edge->node[UPPER];
348         }
349         BUG_ON(node->detached);
350         *index = idx;
351         return node;
352 }
353
354 /*
355  * walk down backref nodes to find start of next reference path
356  */
357 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
358                                               int *index)
359 {
360         struct backref_edge *edge;
361         struct backref_node *lower;
362         int idx = *index;
363
364         while (idx > 0) {
365                 edge = edges[idx - 1];
366                 lower = edge->node[LOWER];
367                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
368                         idx--;
369                         continue;
370                 }
371                 edge = list_entry(edge->list[LOWER].next,
372                                   struct backref_edge, list[LOWER]);
373                 edges[idx - 1] = edge;
374                 *index = idx;
375                 return edge->node[UPPER];
376         }
377         *index = 0;
378         return NULL;
379 }
380
381 static void unlock_node_buffer(struct backref_node *node)
382 {
383         if (node->locked) {
384                 btrfs_tree_unlock(node->eb);
385                 node->locked = 0;
386         }
387 }
388
389 static void drop_node_buffer(struct backref_node *node)
390 {
391         if (node->eb) {
392                 unlock_node_buffer(node);
393                 free_extent_buffer(node->eb);
394                 node->eb = NULL;
395         }
396 }
397
398 static void drop_backref_node(struct backref_cache *tree,
399                               struct backref_node *node)
400 {
401         BUG_ON(!list_empty(&node->upper));
402
403         drop_node_buffer(node);
404         list_del(&node->list);
405         list_del(&node->lower);
406         if (!RB_EMPTY_NODE(&node->rb_node))
407                 rb_erase(&node->rb_node, &tree->rb_root);
408         free_backref_node(tree, node);
409 }
410
411 /*
412  * remove a backref node from the backref cache
413  */
414 static void remove_backref_node(struct backref_cache *cache,
415                                 struct backref_node *node)
416 {
417         struct backref_node *upper;
418         struct backref_edge *edge;
419
420         if (!node)
421                 return;
422
423         BUG_ON(!node->lowest && !node->detached);
424         while (!list_empty(&node->upper)) {
425                 edge = list_entry(node->upper.next, struct backref_edge,
426                                   list[LOWER]);
427                 upper = edge->node[UPPER];
428                 list_del(&edge->list[LOWER]);
429                 list_del(&edge->list[UPPER]);
430                 free_backref_edge(cache, edge);
431
432                 if (RB_EMPTY_NODE(&upper->rb_node)) {
433                         BUG_ON(!list_empty(&node->upper));
434                         drop_backref_node(cache, node);
435                         node = upper;
436                         node->lowest = 1;
437                         continue;
438                 }
439                 /*
440                  * add the node to leaf node list if no other
441                  * child block cached.
442                  */
443                 if (list_empty(&upper->lower)) {
444                         list_add_tail(&upper->lower, &cache->leaves);
445                         upper->lowest = 1;
446                 }
447         }
448
449         drop_backref_node(cache, node);
450 }
451
452 static void update_backref_node(struct backref_cache *cache,
453                                 struct backref_node *node, u64 bytenr)
454 {
455         struct rb_node *rb_node;
456         rb_erase(&node->rb_node, &cache->rb_root);
457         node->bytenr = bytenr;
458         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
459         if (rb_node)
460                 backref_tree_panic(rb_node, -EEXIST, bytenr);
461 }
462
463 /*
464  * update backref cache after a transaction commit
465  */
466 static int update_backref_cache(struct btrfs_trans_handle *trans,
467                                 struct backref_cache *cache)
468 {
469         struct backref_node *node;
470         int level = 0;
471
472         if (cache->last_trans == 0) {
473                 cache->last_trans = trans->transid;
474                 return 0;
475         }
476
477         if (cache->last_trans == trans->transid)
478                 return 0;
479
480         /*
481          * detached nodes are used to avoid unnecessary backref
482          * lookup. transaction commit changes the extent tree.
483          * so the detached nodes are no longer useful.
484          */
485         while (!list_empty(&cache->detached)) {
486                 node = list_entry(cache->detached.next,
487                                   struct backref_node, list);
488                 remove_backref_node(cache, node);
489         }
490
491         while (!list_empty(&cache->changed)) {
492                 node = list_entry(cache->changed.next,
493                                   struct backref_node, list);
494                 list_del_init(&node->list);
495                 BUG_ON(node->pending);
496                 update_backref_node(cache, node, node->new_bytenr);
497         }
498
499         /*
500          * some nodes can be left in the pending list if there were
501          * errors during processing the pending nodes.
502          */
503         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
504                 list_for_each_entry(node, &cache->pending[level], list) {
505                         BUG_ON(!node->pending);
506                         if (node->bytenr == node->new_bytenr)
507                                 continue;
508                         update_backref_node(cache, node, node->new_bytenr);
509                 }
510         }
511
512         cache->last_trans = 0;
513         return 1;
514 }
515
516
517 static int should_ignore_root(struct btrfs_root *root)
518 {
519         struct btrfs_root *reloc_root;
520
521         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
522                 return 0;
523
524         reloc_root = root->reloc_root;
525         if (!reloc_root)
526                 return 0;
527
528         if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
529             root->fs_info->running_transaction->transid - 1)
530                 return 0;
531         /*
532          * if there is reloc tree and it was created in previous
533          * transaction backref lookup can find the reloc tree,
534          * so backref node for the fs tree root is useless for
535          * relocation.
536          */
537         return 1;
538 }
539 /*
540  * find reloc tree by address of tree root
541  */
542 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
543                                           u64 bytenr)
544 {
545         struct rb_node *rb_node;
546         struct mapping_node *node;
547         struct btrfs_root *root = NULL;
548
549         spin_lock(&rc->reloc_root_tree.lock);
550         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
551         if (rb_node) {
552                 node = rb_entry(rb_node, struct mapping_node, rb_node);
553                 root = (struct btrfs_root *)node->data;
554         }
555         spin_unlock(&rc->reloc_root_tree.lock);
556         return root;
557 }
558
559 static int is_cowonly_root(u64 root_objectid)
560 {
561         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
562             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
563             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
564             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
565             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
566             root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
567             root_objectid == BTRFS_UUID_TREE_OBJECTID ||
568             root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
569             root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
570                 return 1;
571         return 0;
572 }
573
574 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
575                                         u64 root_objectid)
576 {
577         struct btrfs_key key;
578
579         key.objectid = root_objectid;
580         key.type = BTRFS_ROOT_ITEM_KEY;
581         if (is_cowonly_root(root_objectid))
582                 key.offset = 0;
583         else
584                 key.offset = (u64)-1;
585
586         return btrfs_get_fs_root(fs_info, &key, false);
587 }
588
589 static noinline_for_stack
590 int find_inline_backref(struct extent_buffer *leaf, int slot,
591                         unsigned long *ptr, unsigned long *end)
592 {
593         struct btrfs_key key;
594         struct btrfs_extent_item *ei;
595         struct btrfs_tree_block_info *bi;
596         u32 item_size;
597
598         btrfs_item_key_to_cpu(leaf, &key, slot);
599
600         item_size = btrfs_item_size_nr(leaf, slot);
601         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
602         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
603                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
604
605         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
606             item_size <= sizeof(*ei) + sizeof(*bi)) {
607                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
608                 return 1;
609         }
610         if (key.type == BTRFS_METADATA_ITEM_KEY &&
611             item_size <= sizeof(*ei)) {
612                 WARN_ON(item_size < sizeof(*ei));
613                 return 1;
614         }
615
616         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
617                 bi = (struct btrfs_tree_block_info *)(ei + 1);
618                 *ptr = (unsigned long)(bi + 1);
619         } else {
620                 *ptr = (unsigned long)(ei + 1);
621         }
622         *end = (unsigned long)ei + item_size;
623         return 0;
624 }
625
626 /*
627  * build backref tree for a given tree block. root of the backref tree
628  * corresponds the tree block, leaves of the backref tree correspond
629  * roots of b-trees that reference the tree block.
630  *
631  * the basic idea of this function is check backrefs of a given block
632  * to find upper level blocks that reference the block, and then check
633  * backrefs of these upper level blocks recursively. the recursion stop
634  * when tree root is reached or backrefs for the block is cached.
635  *
636  * NOTE: if we find backrefs for a block are cached, we know backrefs
637  * for all upper level blocks that directly/indirectly reference the
638  * block are also cached.
639  */
640 static noinline_for_stack
641 struct backref_node *build_backref_tree(struct reloc_control *rc,
642                                         struct btrfs_key *node_key,
643                                         int level, u64 bytenr)
644 {
645         struct backref_cache *cache = &rc->backref_cache;
646         struct btrfs_path *path1;
647         struct btrfs_path *path2;
648         struct extent_buffer *eb;
649         struct btrfs_root *root;
650         struct backref_node *cur;
651         struct backref_node *upper;
652         struct backref_node *lower;
653         struct backref_node *node = NULL;
654         struct backref_node *exist = NULL;
655         struct backref_edge *edge;
656         struct rb_node *rb_node;
657         struct btrfs_key key;
658         unsigned long end;
659         unsigned long ptr;
660         LIST_HEAD(list);
661         LIST_HEAD(useless);
662         int cowonly;
663         int ret;
664         int err = 0;
665         bool need_check = true;
666
667         path1 = btrfs_alloc_path();
668         path2 = btrfs_alloc_path();
669         if (!path1 || !path2) {
670                 err = -ENOMEM;
671                 goto out;
672         }
673         path1->reada = READA_FORWARD;
674         path2->reada = READA_FORWARD;
675
676         node = alloc_backref_node(cache);
677         if (!node) {
678                 err = -ENOMEM;
679                 goto out;
680         }
681
682         node->bytenr = bytenr;
683         node->level = level;
684         node->lowest = 1;
685         cur = node;
686 again:
687         end = 0;
688         ptr = 0;
689         key.objectid = cur->bytenr;
690         key.type = BTRFS_METADATA_ITEM_KEY;
691         key.offset = (u64)-1;
692
693         path1->search_commit_root = 1;
694         path1->skip_locking = 1;
695         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
696                                 0, 0);
697         if (ret < 0) {
698                 err = ret;
699                 goto out;
700         }
701         ASSERT(ret);
702         ASSERT(path1->slots[0]);
703
704         path1->slots[0]--;
705
706         WARN_ON(cur->checked);
707         if (!list_empty(&cur->upper)) {
708                 /*
709                  * the backref was added previously when processing
710                  * backref of type BTRFS_TREE_BLOCK_REF_KEY
711                  */
712                 ASSERT(list_is_singular(&cur->upper));
713                 edge = list_entry(cur->upper.next, struct backref_edge,
714                                   list[LOWER]);
715                 ASSERT(list_empty(&edge->list[UPPER]));
716                 exist = edge->node[UPPER];
717                 /*
718                  * add the upper level block to pending list if we need
719                  * check its backrefs
720                  */
721                 if (!exist->checked)
722                         list_add_tail(&edge->list[UPPER], &list);
723         } else {
724                 exist = NULL;
725         }
726
727         while (1) {
728                 cond_resched();
729                 eb = path1->nodes[0];
730
731                 if (ptr >= end) {
732                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
733                                 ret = btrfs_next_leaf(rc->extent_root, path1);
734                                 if (ret < 0) {
735                                         err = ret;
736                                         goto out;
737                                 }
738                                 if (ret > 0)
739                                         break;
740                                 eb = path1->nodes[0];
741                         }
742
743                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
744                         if (key.objectid != cur->bytenr) {
745                                 WARN_ON(exist);
746                                 break;
747                         }
748
749                         if (key.type == BTRFS_EXTENT_ITEM_KEY ||
750                             key.type == BTRFS_METADATA_ITEM_KEY) {
751                                 ret = find_inline_backref(eb, path1->slots[0],
752                                                           &ptr, &end);
753                                 if (ret)
754                                         goto next;
755                         }
756                 }
757
758                 if (ptr < end) {
759                         /* update key for inline back ref */
760                         struct btrfs_extent_inline_ref *iref;
761                         int type;
762                         iref = (struct btrfs_extent_inline_ref *)ptr;
763                         type = btrfs_get_extent_inline_ref_type(eb, iref,
764                                                         BTRFS_REF_TYPE_BLOCK);
765                         if (type == BTRFS_REF_TYPE_INVALID) {
766                                 err = -EUCLEAN;
767                                 goto out;
768                         }
769                         key.type = type;
770                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
771
772                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
773                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
774                 }
775
776                 if (exist &&
777                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
778                       exist->owner == key.offset) ||
779                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
780                       exist->bytenr == key.offset))) {
781                         exist = NULL;
782                         goto next;
783                 }
784
785                 ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
786                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
787                         if (key.objectid == key.offset) {
788                                 /*
789                                  * only root blocks of reloc trees use
790                                  * backref of this type.
791                                  */
792                                 root = find_reloc_root(rc, cur->bytenr);
793                                 ASSERT(root);
794                                 cur->root = root;
795                                 break;
796                         }
797
798                         edge = alloc_backref_edge(cache);
799                         if (!edge) {
800                                 err = -ENOMEM;
801                                 goto out;
802                         }
803                         rb_node = tree_search(&cache->rb_root, key.offset);
804                         if (!rb_node) {
805                                 upper = alloc_backref_node(cache);
806                                 if (!upper) {
807                                         free_backref_edge(cache, edge);
808                                         err = -ENOMEM;
809                                         goto out;
810                                 }
811                                 upper->bytenr = key.offset;
812                                 upper->level = cur->level + 1;
813                                 /*
814                                  *  backrefs for the upper level block isn't
815                                  *  cached, add the block to pending list
816                                  */
817                                 list_add_tail(&edge->list[UPPER], &list);
818                         } else {
819                                 upper = rb_entry(rb_node, struct backref_node,
820                                                  rb_node);
821                                 ASSERT(upper->checked);
822                                 INIT_LIST_HEAD(&edge->list[UPPER]);
823                         }
824                         list_add_tail(&edge->list[LOWER], &cur->upper);
825                         edge->node[LOWER] = cur;
826                         edge->node[UPPER] = upper;
827
828                         goto next;
829                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
830                         goto next;
831                 }
832
833                 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
834                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
835                 if (IS_ERR(root)) {
836                         err = PTR_ERR(root);
837                         goto out;
838                 }
839
840                 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
841                         cur->cowonly = 1;
842
843                 if (btrfs_root_level(&root->root_item) == cur->level) {
844                         /* tree root */
845                         ASSERT(btrfs_root_bytenr(&root->root_item) ==
846                                cur->bytenr);
847                         if (should_ignore_root(root))
848                                 list_add(&cur->list, &useless);
849                         else
850                                 cur->root = root;
851                         break;
852                 }
853
854                 level = cur->level + 1;
855
856                 /*
857                  * searching the tree to find upper level blocks
858                  * reference the block.
859                  */
860                 path2->search_commit_root = 1;
861                 path2->skip_locking = 1;
862                 path2->lowest_level = level;
863                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
864                 path2->lowest_level = 0;
865                 if (ret < 0) {
866                         err = ret;
867                         goto out;
868                 }
869                 if (ret > 0 && path2->slots[level] > 0)
870                         path2->slots[level]--;
871
872                 eb = path2->nodes[level];
873                 if (btrfs_node_blockptr(eb, path2->slots[level]) !=
874                     cur->bytenr) {
875                         btrfs_err(root->fs_info,
876         "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
877                                   cur->bytenr, level - 1, root->objectid,
878                                   node_key->objectid, node_key->type,
879                                   node_key->offset);
880                         err = -ENOENT;
881                         goto out;
882                 }
883                 lower = cur;
884                 need_check = true;
885                 for (; level < BTRFS_MAX_LEVEL; level++) {
886                         if (!path2->nodes[level]) {
887                                 ASSERT(btrfs_root_bytenr(&root->root_item) ==
888                                        lower->bytenr);
889                                 if (should_ignore_root(root))
890                                         list_add(&lower->list, &useless);
891                                 else
892                                         lower->root = root;
893                                 break;
894                         }
895
896                         edge = alloc_backref_edge(cache);
897                         if (!edge) {
898                                 err = -ENOMEM;
899                                 goto out;
900                         }
901
902                         eb = path2->nodes[level];
903                         rb_node = tree_search(&cache->rb_root, eb->start);
904                         if (!rb_node) {
905                                 upper = alloc_backref_node(cache);
906                                 if (!upper) {
907                                         free_backref_edge(cache, edge);
908                                         err = -ENOMEM;
909                                         goto out;
910                                 }
911                                 upper->bytenr = eb->start;
912                                 upper->owner = btrfs_header_owner(eb);
913                                 upper->level = lower->level + 1;
914                                 if (!test_bit(BTRFS_ROOT_REF_COWS,
915                                               &root->state))
916                                         upper->cowonly = 1;
917
918                                 /*
919                                  * if we know the block isn't shared
920                                  * we can void checking its backrefs.
921                                  */
922                                 if (btrfs_block_can_be_shared(root, eb))
923                                         upper->checked = 0;
924                                 else
925                                         upper->checked = 1;
926
927                                 /*
928                                  * add the block to pending list if we
929                                  * need check its backrefs, we only do this once
930                                  * while walking up a tree as we will catch
931                                  * anything else later on.
932                                  */
933                                 if (!upper->checked && need_check) {
934                                         need_check = false;
935                                         list_add_tail(&edge->list[UPPER],
936                                                       &list);
937                                 } else {
938                                         if (upper->checked)
939                                                 need_check = true;
940                                         INIT_LIST_HEAD(&edge->list[UPPER]);
941                                 }
942                         } else {
943                                 upper = rb_entry(rb_node, struct backref_node,
944                                                  rb_node);
945                                 ASSERT(upper->checked);
946                                 INIT_LIST_HEAD(&edge->list[UPPER]);
947                                 if (!upper->owner)
948                                         upper->owner = btrfs_header_owner(eb);
949                         }
950                         list_add_tail(&edge->list[LOWER], &lower->upper);
951                         edge->node[LOWER] = lower;
952                         edge->node[UPPER] = upper;
953
954                         if (rb_node)
955                                 break;
956                         lower = upper;
957                         upper = NULL;
958                 }
959                 btrfs_release_path(path2);
960 next:
961                 if (ptr < end) {
962                         ptr += btrfs_extent_inline_ref_size(key.type);
963                         if (ptr >= end) {
964                                 WARN_ON(ptr > end);
965                                 ptr = 0;
966                                 end = 0;
967                         }
968                 }
969                 if (ptr >= end)
970                         path1->slots[0]++;
971         }
972         btrfs_release_path(path1);
973
974         cur->checked = 1;
975         WARN_ON(exist);
976
977         /* the pending list isn't empty, take the first block to process */
978         if (!list_empty(&list)) {
979                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
980                 list_del_init(&edge->list[UPPER]);
981                 cur = edge->node[UPPER];
982                 goto again;
983         }
984
985         /*
986          * everything goes well, connect backref nodes and insert backref nodes
987          * into the cache.
988          */
989         ASSERT(node->checked);
990         cowonly = node->cowonly;
991         if (!cowonly) {
992                 rb_node = tree_insert(&cache->rb_root, node->bytenr,
993                                       &node->rb_node);
994                 if (rb_node)
995                         backref_tree_panic(rb_node, -EEXIST, node->bytenr);
996                 list_add_tail(&node->lower, &cache->leaves);
997         }
998
999         list_for_each_entry(edge, &node->upper, list[LOWER])
1000                 list_add_tail(&edge->list[UPPER], &list);
1001
1002         while (!list_empty(&list)) {
1003                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1004                 list_del_init(&edge->list[UPPER]);
1005                 upper = edge->node[UPPER];
1006                 if (upper->detached) {
1007                         list_del(&edge->list[LOWER]);
1008                         lower = edge->node[LOWER];
1009                         free_backref_edge(cache, edge);
1010                         if (list_empty(&lower->upper))
1011                                 list_add(&lower->list, &useless);
1012                         continue;
1013                 }
1014
1015                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
1016                         if (upper->lowest) {
1017                                 list_del_init(&upper->lower);
1018                                 upper->lowest = 0;
1019                         }
1020
1021                         list_add_tail(&edge->list[UPPER], &upper->lower);
1022                         continue;
1023                 }
1024
1025                 if (!upper->checked) {
1026                         /*
1027                          * Still want to blow up for developers since this is a
1028                          * logic bug.
1029                          */
1030                         ASSERT(0);
1031                         err = -EINVAL;
1032                         goto out;
1033                 }
1034                 if (cowonly != upper->cowonly) {
1035                         ASSERT(0);
1036                         err = -EINVAL;
1037                         goto out;
1038                 }
1039
1040                 if (!cowonly) {
1041                         rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1042                                               &upper->rb_node);
1043                         if (rb_node)
1044                                 backref_tree_panic(rb_node, -EEXIST,
1045                                                    upper->bytenr);
1046                 }
1047
1048                 list_add_tail(&edge->list[UPPER], &upper->lower);
1049
1050                 list_for_each_entry(edge, &upper->upper, list[LOWER])
1051                         list_add_tail(&edge->list[UPPER], &list);
1052         }
1053         /*
1054          * process useless backref nodes. backref nodes for tree leaves
1055          * are deleted from the cache. backref nodes for upper level
1056          * tree blocks are left in the cache to avoid unnecessary backref
1057          * lookup.
1058          */
1059         while (!list_empty(&useless)) {
1060                 upper = list_entry(useless.next, struct backref_node, list);
1061                 list_del_init(&upper->list);
1062                 ASSERT(list_empty(&upper->upper));
1063                 if (upper == node)
1064                         node = NULL;
1065                 if (upper->lowest) {
1066                         list_del_init(&upper->lower);
1067                         upper->lowest = 0;
1068                 }
1069                 while (!list_empty(&upper->lower)) {
1070                         edge = list_entry(upper->lower.next,
1071                                           struct backref_edge, list[UPPER]);
1072                         list_del(&edge->list[UPPER]);
1073                         list_del(&edge->list[LOWER]);
1074                         lower = edge->node[LOWER];
1075                         free_backref_edge(cache, edge);
1076
1077                         if (list_empty(&lower->upper))
1078                                 list_add(&lower->list, &useless);
1079                 }
1080                 __mark_block_processed(rc, upper);
1081                 if (upper->level > 0) {
1082                         list_add(&upper->list, &cache->detached);
1083                         upper->detached = 1;
1084                 } else {
1085                         rb_erase(&upper->rb_node, &cache->rb_root);
1086                         free_backref_node(cache, upper);
1087                 }
1088         }
1089 out:
1090         btrfs_free_path(path1);
1091         btrfs_free_path(path2);
1092         if (err) {
1093                 while (!list_empty(&useless)) {
1094                         lower = list_entry(useless.next,
1095                                            struct backref_node, list);
1096                         list_del_init(&lower->list);
1097                 }
1098                 while (!list_empty(&list)) {
1099                         edge = list_first_entry(&list, struct backref_edge,
1100                                                 list[UPPER]);
1101                         list_del(&edge->list[UPPER]);
1102                         list_del(&edge->list[LOWER]);
1103                         lower = edge->node[LOWER];
1104                         upper = edge->node[UPPER];
1105                         free_backref_edge(cache, edge);
1106
1107                         /*
1108                          * Lower is no longer linked to any upper backref nodes
1109                          * and isn't in the cache, we can free it ourselves.
1110                          */
1111                         if (list_empty(&lower->upper) &&
1112                             RB_EMPTY_NODE(&lower->rb_node))
1113                                 list_add(&lower->list, &useless);
1114
1115                         if (!RB_EMPTY_NODE(&upper->rb_node))
1116                                 continue;
1117
1118                         /* Add this guy's upper edges to the list to process */
1119                         list_for_each_entry(edge, &upper->upper, list[LOWER])
1120                                 list_add_tail(&edge->list[UPPER], &list);
1121                         if (list_empty(&upper->upper))
1122                                 list_add(&upper->list, &useless);
1123                 }
1124
1125                 while (!list_empty(&useless)) {
1126                         lower = list_entry(useless.next,
1127                                            struct backref_node, list);
1128                         list_del_init(&lower->list);
1129                         if (lower == node)
1130                                 node = NULL;
1131                         free_backref_node(cache, lower);
1132                 }
1133
1134                 free_backref_node(cache, node);
1135                 return ERR_PTR(err);
1136         }
1137         ASSERT(!node || !node->detached);
1138         return node;
1139 }
1140
1141 /*
1142  * helper to add backref node for the newly created snapshot.
1143  * the backref node is created by cloning backref node that
1144  * corresponds to root of source tree
1145  */
1146 static int clone_backref_node(struct btrfs_trans_handle *trans,
1147                               struct reloc_control *rc,
1148                               struct btrfs_root *src,
1149                               struct btrfs_root *dest)
1150 {
1151         struct btrfs_root *reloc_root = src->reloc_root;
1152         struct backref_cache *cache = &rc->backref_cache;
1153         struct backref_node *node = NULL;
1154         struct backref_node *new_node;
1155         struct backref_edge *edge;
1156         struct backref_edge *new_edge;
1157         struct rb_node *rb_node;
1158
1159         if (cache->last_trans > 0)
1160                 update_backref_cache(trans, cache);
1161
1162         rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1163         if (rb_node) {
1164                 node = rb_entry(rb_node, struct backref_node, rb_node);
1165                 if (node->detached)
1166                         node = NULL;
1167                 else
1168                         BUG_ON(node->new_bytenr != reloc_root->node->start);
1169         }
1170
1171         if (!node) {
1172                 rb_node = tree_search(&cache->rb_root,
1173                                       reloc_root->commit_root->start);
1174                 if (rb_node) {
1175                         node = rb_entry(rb_node, struct backref_node,
1176                                         rb_node);
1177                         BUG_ON(node->detached);
1178                 }
1179         }
1180
1181         if (!node)
1182                 return 0;
1183
1184         new_node = alloc_backref_node(cache);
1185         if (!new_node)
1186                 return -ENOMEM;
1187
1188         new_node->bytenr = dest->node->start;
1189         new_node->level = node->level;
1190         new_node->lowest = node->lowest;
1191         new_node->checked = 1;
1192         new_node->root = dest;
1193
1194         if (!node->lowest) {
1195                 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1196                         new_edge = alloc_backref_edge(cache);
1197                         if (!new_edge)
1198                                 goto fail;
1199
1200                         new_edge->node[UPPER] = new_node;
1201                         new_edge->node[LOWER] = edge->node[LOWER];
1202                         list_add_tail(&new_edge->list[UPPER],
1203                                       &new_node->lower);
1204                 }
1205         } else {
1206                 list_add_tail(&new_node->lower, &cache->leaves);
1207         }
1208
1209         rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1210                               &new_node->rb_node);
1211         if (rb_node)
1212                 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1213
1214         if (!new_node->lowest) {
1215                 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1216                         list_add_tail(&new_edge->list[LOWER],
1217                                       &new_edge->node[LOWER]->upper);
1218                 }
1219         }
1220         return 0;
1221 fail:
1222         while (!list_empty(&new_node->lower)) {
1223                 new_edge = list_entry(new_node->lower.next,
1224                                       struct backref_edge, list[UPPER]);
1225                 list_del(&new_edge->list[UPPER]);
1226                 free_backref_edge(cache, new_edge);
1227         }
1228         free_backref_node(cache, new_node);
1229         return -ENOMEM;
1230 }
1231
1232 /*
1233  * helper to add 'address of tree root -> reloc tree' mapping
1234  */
1235 static int __must_check __add_reloc_root(struct btrfs_root *root)
1236 {
1237         struct btrfs_fs_info *fs_info = root->fs_info;
1238         struct rb_node *rb_node;
1239         struct mapping_node *node;
1240         struct reloc_control *rc = fs_info->reloc_ctl;
1241
1242         node = kmalloc(sizeof(*node), GFP_NOFS);
1243         if (!node)
1244                 return -ENOMEM;
1245
1246         node->bytenr = root->node->start;
1247         node->data = root;
1248
1249         spin_lock(&rc->reloc_root_tree.lock);
1250         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1251                               node->bytenr, &node->rb_node);
1252         spin_unlock(&rc->reloc_root_tree.lock);
1253         if (rb_node) {
1254                 btrfs_panic(fs_info, -EEXIST,
1255                             "Duplicate root found for start=%llu while inserting into relocation tree",
1256                             node->bytenr);
1257         }
1258
1259         list_add_tail(&root->root_list, &rc->reloc_roots);
1260         return 0;
1261 }
1262
1263 /*
1264  * helper to delete the 'address of tree root -> reloc tree'
1265  * mapping
1266  */
1267 static void __del_reloc_root(struct btrfs_root *root)
1268 {
1269         struct btrfs_fs_info *fs_info = root->fs_info;
1270         struct rb_node *rb_node;
1271         struct mapping_node *node = NULL;
1272         struct reloc_control *rc = fs_info->reloc_ctl;
1273
1274         spin_lock(&rc->reloc_root_tree.lock);
1275         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1276                               root->node->start);
1277         if (rb_node) {
1278                 node = rb_entry(rb_node, struct mapping_node, rb_node);
1279                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1280         }
1281         spin_unlock(&rc->reloc_root_tree.lock);
1282
1283         if (!node)
1284                 return;
1285         BUG_ON((struct btrfs_root *)node->data != root);
1286
1287         spin_lock(&fs_info->trans_lock);
1288         list_del_init(&root->root_list);
1289         spin_unlock(&fs_info->trans_lock);
1290         kfree(node);
1291 }
1292
1293 /*
1294  * helper to update the 'address of tree root -> reloc tree'
1295  * mapping
1296  */
1297 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1298 {
1299         struct btrfs_fs_info *fs_info = root->fs_info;
1300         struct rb_node *rb_node;
1301         struct mapping_node *node = NULL;
1302         struct reloc_control *rc = fs_info->reloc_ctl;
1303
1304         spin_lock(&rc->reloc_root_tree.lock);
1305         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1306                               root->node->start);
1307         if (rb_node) {
1308                 node = rb_entry(rb_node, struct mapping_node, rb_node);
1309                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1310         }
1311         spin_unlock(&rc->reloc_root_tree.lock);
1312
1313         if (!node)
1314                 return 0;
1315         BUG_ON((struct btrfs_root *)node->data != root);
1316
1317         spin_lock(&rc->reloc_root_tree.lock);
1318         node->bytenr = new_bytenr;
1319         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1320                               node->bytenr, &node->rb_node);
1321         spin_unlock(&rc->reloc_root_tree.lock);
1322         if (rb_node)
1323                 backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1324         return 0;
1325 }
1326
1327 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1328                                         struct btrfs_root *root, u64 objectid)
1329 {
1330         struct btrfs_fs_info *fs_info = root->fs_info;
1331         struct btrfs_root *reloc_root;
1332         struct extent_buffer *eb;
1333         struct btrfs_root_item *root_item;
1334         struct btrfs_key root_key;
1335         int ret;
1336
1337         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1338         BUG_ON(!root_item);
1339
1340         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1341         root_key.type = BTRFS_ROOT_ITEM_KEY;
1342         root_key.offset = objectid;
1343
1344         if (root->root_key.objectid == objectid) {
1345                 u64 commit_root_gen;
1346
1347                 /* called by btrfs_init_reloc_root */
1348                 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1349                                       BTRFS_TREE_RELOC_OBJECTID);
1350                 BUG_ON(ret);
1351                 /*
1352                  * Set the last_snapshot field to the generation of the commit
1353                  * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1354                  * correctly (returns true) when the relocation root is created
1355                  * either inside the critical section of a transaction commit
1356                  * (through transaction.c:qgroup_account_snapshot()) and when
1357                  * it's created before the transaction commit is started.
1358                  */
1359                 commit_root_gen = btrfs_header_generation(root->commit_root);
1360                 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1361         } else {
1362                 /*
1363                  * called by btrfs_reloc_post_snapshot_hook.
1364                  * the source tree is a reloc tree, all tree blocks
1365                  * modified after it was created have RELOC flag
1366                  * set in their headers. so it's OK to not update
1367                  * the 'last_snapshot'.
1368                  */
1369                 ret = btrfs_copy_root(trans, root, root->node, &eb,
1370                                       BTRFS_TREE_RELOC_OBJECTID);
1371                 BUG_ON(ret);
1372         }
1373
1374         memcpy(root_item, &root->root_item, sizeof(*root_item));
1375         btrfs_set_root_bytenr(root_item, eb->start);
1376         btrfs_set_root_level(root_item, btrfs_header_level(eb));
1377         btrfs_set_root_generation(root_item, trans->transid);
1378
1379         if (root->root_key.objectid == objectid) {
1380                 btrfs_set_root_refs(root_item, 0);
1381                 memset(&root_item->drop_progress, 0,
1382                        sizeof(struct btrfs_disk_key));
1383                 root_item->drop_level = 0;
1384         }
1385
1386         btrfs_tree_unlock(eb);
1387         free_extent_buffer(eb);
1388
1389         ret = btrfs_insert_root(trans, fs_info->tree_root,
1390                                 &root_key, root_item);
1391         BUG_ON(ret);
1392         kfree(root_item);
1393
1394         reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1395         BUG_ON(IS_ERR(reloc_root));
1396         reloc_root->last_trans = trans->transid;
1397         return reloc_root;
1398 }
1399
1400 /*
1401  * create reloc tree for a given fs tree. reloc tree is just a
1402  * snapshot of the fs tree with special root objectid.
1403  */
1404 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1405                           struct btrfs_root *root)
1406 {
1407         struct btrfs_fs_info *fs_info = root->fs_info;
1408         struct btrfs_root *reloc_root;
1409         struct reloc_control *rc = fs_info->reloc_ctl;
1410         struct btrfs_block_rsv *rsv;
1411         int clear_rsv = 0;
1412         int ret;
1413
1414         if (root->reloc_root) {
1415                 reloc_root = root->reloc_root;
1416                 reloc_root->last_trans = trans->transid;
1417                 return 0;
1418         }
1419
1420         if (!rc || !rc->create_reloc_tree ||
1421             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1422                 return 0;
1423
1424         if (!trans->reloc_reserved) {
1425                 rsv = trans->block_rsv;
1426                 trans->block_rsv = rc->block_rsv;
1427                 clear_rsv = 1;
1428         }
1429         reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1430         if (clear_rsv)
1431                 trans->block_rsv = rsv;
1432
1433         ret = __add_reloc_root(reloc_root);
1434         BUG_ON(ret < 0);
1435         root->reloc_root = reloc_root;
1436         return 0;
1437 }
1438
1439 /*
1440  * update root item of reloc tree
1441  */
1442 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1443                             struct btrfs_root *root)
1444 {
1445         struct btrfs_fs_info *fs_info = root->fs_info;
1446         struct btrfs_root *reloc_root;
1447         struct btrfs_root_item *root_item;
1448         int ret;
1449
1450         if (!root->reloc_root)
1451                 goto out;
1452
1453         reloc_root = root->reloc_root;
1454         root_item = &reloc_root->root_item;
1455
1456         if (fs_info->reloc_ctl->merge_reloc_tree &&
1457             btrfs_root_refs(root_item) == 0) {
1458                 root->reloc_root = NULL;
1459                 __del_reloc_root(reloc_root);
1460         }
1461
1462         if (reloc_root->commit_root != reloc_root->node) {
1463                 btrfs_set_root_node(root_item, reloc_root->node);
1464                 free_extent_buffer(reloc_root->commit_root);
1465                 reloc_root->commit_root = btrfs_root_node(reloc_root);
1466         }
1467
1468         ret = btrfs_update_root(trans, fs_info->tree_root,
1469                                 &reloc_root->root_key, root_item);
1470         BUG_ON(ret);
1471
1472 out:
1473         return 0;
1474 }
1475
1476 /*
1477  * helper to find first cached inode with inode number >= objectid
1478  * in a subvolume
1479  */
1480 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1481 {
1482         struct rb_node *node;
1483         struct rb_node *prev;
1484         struct btrfs_inode *entry;
1485         struct inode *inode;
1486
1487         spin_lock(&root->inode_lock);
1488 again:
1489         node = root->inode_tree.rb_node;
1490         prev = NULL;
1491         while (node) {
1492                 prev = node;
1493                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1494
1495                 if (objectid < btrfs_ino(entry))
1496                         node = node->rb_left;
1497                 else if (objectid > btrfs_ino(entry))
1498                         node = node->rb_right;
1499                 else
1500                         break;
1501         }
1502         if (!node) {
1503                 while (prev) {
1504                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1505                         if (objectid <= btrfs_ino(entry)) {
1506                                 node = prev;
1507                                 break;
1508                         }
1509                         prev = rb_next(prev);
1510                 }
1511         }
1512         while (node) {
1513                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1514                 inode = igrab(&entry->vfs_inode);
1515                 if (inode) {
1516                         spin_unlock(&root->inode_lock);
1517                         return inode;
1518                 }
1519
1520                 objectid = btrfs_ino(entry) + 1;
1521                 if (cond_resched_lock(&root->inode_lock))
1522                         goto again;
1523
1524                 node = rb_next(node);
1525         }
1526         spin_unlock(&root->inode_lock);
1527         return NULL;
1528 }
1529
1530 static int in_block_group(u64 bytenr,
1531                           struct btrfs_block_group_cache *block_group)
1532 {
1533         if (bytenr >= block_group->key.objectid &&
1534             bytenr < block_group->key.objectid + block_group->key.offset)
1535                 return 1;
1536         return 0;
1537 }
1538
1539 /*
1540  * get new location of data
1541  */
1542 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1543                             u64 bytenr, u64 num_bytes)
1544 {
1545         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1546         struct btrfs_path *path;
1547         struct btrfs_file_extent_item *fi;
1548         struct extent_buffer *leaf;
1549         int ret;
1550
1551         path = btrfs_alloc_path();
1552         if (!path)
1553                 return -ENOMEM;
1554
1555         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1556         ret = btrfs_lookup_file_extent(NULL, root, path,
1557                         btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1558         if (ret < 0)
1559                 goto out;
1560         if (ret > 0) {
1561                 ret = -ENOENT;
1562                 goto out;
1563         }
1564
1565         leaf = path->nodes[0];
1566         fi = btrfs_item_ptr(leaf, path->slots[0],
1567                             struct btrfs_file_extent_item);
1568
1569         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1570                btrfs_file_extent_compression(leaf, fi) ||
1571                btrfs_file_extent_encryption(leaf, fi) ||
1572                btrfs_file_extent_other_encoding(leaf, fi));
1573
1574         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1575                 ret = -EINVAL;
1576                 goto out;
1577         }
1578
1579         *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1580         ret = 0;
1581 out:
1582         btrfs_free_path(path);
1583         return ret;
1584 }
1585
1586 /*
1587  * update file extent items in the tree leaf to point to
1588  * the new locations.
1589  */
1590 static noinline_for_stack
1591 int replace_file_extents(struct btrfs_trans_handle *trans,
1592                          struct reloc_control *rc,
1593                          struct btrfs_root *root,
1594                          struct extent_buffer *leaf)
1595 {
1596         struct btrfs_fs_info *fs_info = root->fs_info;
1597         struct btrfs_key key;
1598         struct btrfs_file_extent_item *fi;
1599         struct inode *inode = NULL;
1600         u64 parent;
1601         u64 bytenr;
1602         u64 new_bytenr = 0;
1603         u64 num_bytes;
1604         u64 end;
1605         u32 nritems;
1606         u32 i;
1607         int ret = 0;
1608         int first = 1;
1609         int dirty = 0;
1610
1611         if (rc->stage != UPDATE_DATA_PTRS)
1612                 return 0;
1613
1614         /* reloc trees always use full backref */
1615         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1616                 parent = leaf->start;
1617         else
1618                 parent = 0;
1619
1620         nritems = btrfs_header_nritems(leaf);
1621         for (i = 0; i < nritems; i++) {
1622                 cond_resched();
1623                 btrfs_item_key_to_cpu(leaf, &key, i);
1624                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1625                         continue;
1626                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1627                 if (btrfs_file_extent_type(leaf, fi) ==
1628                     BTRFS_FILE_EXTENT_INLINE)
1629                         continue;
1630                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1631                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1632                 if (bytenr == 0)
1633                         continue;
1634                 if (!in_block_group(bytenr, rc->block_group))
1635                         continue;
1636
1637                 /*
1638                  * if we are modifying block in fs tree, wait for readpage
1639                  * to complete and drop the extent cache
1640                  */
1641                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1642                         if (first) {
1643                                 inode = find_next_inode(root, key.objectid);
1644                                 first = 0;
1645                         } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1646                                 btrfs_add_delayed_iput(inode);
1647                                 inode = find_next_inode(root, key.objectid);
1648                         }
1649                         if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1650                                 end = key.offset +
1651                                       btrfs_file_extent_num_bytes(leaf, fi);
1652                                 WARN_ON(!IS_ALIGNED(key.offset,
1653                                                     fs_info->sectorsize));
1654                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1655                                 end--;
1656                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1657                                                       key.offset, end);
1658                                 if (!ret)
1659                                         continue;
1660
1661                                 btrfs_drop_extent_cache(BTRFS_I(inode),
1662                                                 key.offset,     end, 1);
1663                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1664                                               key.offset, end);
1665                         }
1666                 }
1667
1668                 ret = get_new_location(rc->data_inode, &new_bytenr,
1669                                        bytenr, num_bytes);
1670                 if (ret) {
1671                         /*
1672                          * Don't have to abort since we've not changed anything
1673                          * in the file extent yet.
1674                          */
1675                         break;
1676                 }
1677
1678                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1679                 dirty = 1;
1680
1681                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1682                 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1683                                            num_bytes, parent,
1684                                            btrfs_header_owner(leaf),
1685                                            key.objectid, key.offset);
1686                 if (ret) {
1687                         btrfs_abort_transaction(trans, ret);
1688                         break;
1689                 }
1690
1691                 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1692                                         parent, btrfs_header_owner(leaf),
1693                                         key.objectid, key.offset);
1694                 if (ret) {
1695                         btrfs_abort_transaction(trans, ret);
1696                         break;
1697                 }
1698         }
1699         if (dirty)
1700                 btrfs_mark_buffer_dirty(leaf);
1701         if (inode)
1702                 btrfs_add_delayed_iput(inode);
1703         return ret;
1704 }
1705
1706 static noinline_for_stack
1707 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1708                      struct btrfs_path *path, int level)
1709 {
1710         struct btrfs_disk_key key1;
1711         struct btrfs_disk_key key2;
1712         btrfs_node_key(eb, &key1, slot);
1713         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1714         return memcmp(&key1, &key2, sizeof(key1));
1715 }
1716
1717 /*
1718  * try to replace tree blocks in fs tree with the new blocks
1719  * in reloc tree. tree blocks haven't been modified since the
1720  * reloc tree was create can be replaced.
1721  *
1722  * if a block was replaced, level of the block + 1 is returned.
1723  * if no block got replaced, 0 is returned. if there are other
1724  * errors, a negative error number is returned.
1725  */
1726 static noinline_for_stack
1727 int replace_path(struct btrfs_trans_handle *trans,
1728                  struct btrfs_root *dest, struct btrfs_root *src,
1729                  struct btrfs_path *path, struct btrfs_key *next_key,
1730                  int lowest_level, int max_level)
1731 {
1732         struct btrfs_fs_info *fs_info = dest->fs_info;
1733         struct extent_buffer *eb;
1734         struct extent_buffer *parent;
1735         struct btrfs_key key;
1736         u64 old_bytenr;
1737         u64 new_bytenr;
1738         u64 old_ptr_gen;
1739         u64 new_ptr_gen;
1740         u64 last_snapshot;
1741         u32 blocksize;
1742         int cow = 0;
1743         int level;
1744         int ret;
1745         int slot;
1746
1747         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1748         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1749
1750         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1751 again:
1752         slot = path->slots[lowest_level];
1753         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1754
1755         eb = btrfs_lock_root_node(dest);
1756         btrfs_set_lock_blocking(eb);
1757         level = btrfs_header_level(eb);
1758
1759         if (level < lowest_level) {
1760                 btrfs_tree_unlock(eb);
1761                 free_extent_buffer(eb);
1762                 return 0;
1763         }
1764
1765         if (cow) {
1766                 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1767                 BUG_ON(ret);
1768         }
1769         btrfs_set_lock_blocking(eb);
1770
1771         if (next_key) {
1772                 next_key->objectid = (u64)-1;
1773                 next_key->type = (u8)-1;
1774                 next_key->offset = (u64)-1;
1775         }
1776
1777         parent = eb;
1778         while (1) {
1779                 struct btrfs_key first_key;
1780
1781                 level = btrfs_header_level(parent);
1782                 BUG_ON(level < lowest_level);
1783
1784                 ret = btrfs_bin_search(parent, &key, level, &slot);
1785                 if (ret && slot > 0)
1786                         slot--;
1787
1788                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1789                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1790
1791                 old_bytenr = btrfs_node_blockptr(parent, slot);
1792                 blocksize = fs_info->nodesize;
1793                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1794                 btrfs_node_key_to_cpu(parent, &first_key, slot);
1795
1796                 if (level <= max_level) {
1797                         eb = path->nodes[level];
1798                         new_bytenr = btrfs_node_blockptr(eb,
1799                                                         path->slots[level]);
1800                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1801                                                         path->slots[level]);
1802                 } else {
1803                         new_bytenr = 0;
1804                         new_ptr_gen = 0;
1805                 }
1806
1807                 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1808                         ret = level;
1809                         break;
1810                 }
1811
1812                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1813                     memcmp_node_keys(parent, slot, path, level)) {
1814                         if (level <= lowest_level) {
1815                                 ret = 0;
1816                                 break;
1817                         }
1818
1819                         eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1820                                              level - 1, &first_key);
1821                         if (IS_ERR(eb)) {
1822                                 ret = PTR_ERR(eb);
1823                                 break;
1824                         } else if (!extent_buffer_uptodate(eb)) {
1825                                 ret = -EIO;
1826                                 free_extent_buffer(eb);
1827                                 break;
1828                         }
1829                         btrfs_tree_lock(eb);
1830                         if (cow) {
1831                                 ret = btrfs_cow_block(trans, dest, eb, parent,
1832                                                       slot, &eb);
1833                                 BUG_ON(ret);
1834                         }
1835                         btrfs_set_lock_blocking(eb);
1836
1837                         btrfs_tree_unlock(parent);
1838                         free_extent_buffer(parent);
1839
1840                         parent = eb;
1841                         continue;
1842                 }
1843
1844                 if (!cow) {
1845                         btrfs_tree_unlock(parent);
1846                         free_extent_buffer(parent);
1847                         cow = 1;
1848                         goto again;
1849                 }
1850
1851                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1852                                       path->slots[level]);
1853                 btrfs_release_path(path);
1854
1855                 path->lowest_level = level;
1856                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1857                 path->lowest_level = 0;
1858                 BUG_ON(ret);
1859
1860                 /*
1861                  * Info qgroup to trace both subtrees.
1862                  *
1863                  * We must trace both trees.
1864                  * 1) Tree reloc subtree
1865                  *    If not traced, we will leak data numbers
1866                  * 2) Fs subtree
1867                  *    If not traced, we will double count old data
1868                  *    and tree block numbers, if current trans doesn't free
1869                  *    data reloc tree inode.
1870                  */
1871                 ret = btrfs_qgroup_trace_subtree(trans, src, parent,
1872                                 btrfs_header_generation(parent),
1873                                 btrfs_header_level(parent));
1874                 if (ret < 0)
1875                         break;
1876                 ret = btrfs_qgroup_trace_subtree(trans, dest,
1877                                 path->nodes[level],
1878                                 btrfs_header_generation(path->nodes[level]),
1879                                 btrfs_header_level(path->nodes[level]));
1880                 if (ret < 0)
1881                         break;
1882
1883                 /*
1884                  * swap blocks in fs tree and reloc tree.
1885                  */
1886                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1887                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1888                 btrfs_mark_buffer_dirty(parent);
1889
1890                 btrfs_set_node_blockptr(path->nodes[level],
1891                                         path->slots[level], old_bytenr);
1892                 btrfs_set_node_ptr_generation(path->nodes[level],
1893                                               path->slots[level], old_ptr_gen);
1894                 btrfs_mark_buffer_dirty(path->nodes[level]);
1895
1896                 ret = btrfs_inc_extent_ref(trans, src, old_bytenr,
1897                                         blocksize, path->nodes[level]->start,
1898                                         src->root_key.objectid, level - 1, 0);
1899                 BUG_ON(ret);
1900                 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr,
1901                                         blocksize, 0, dest->root_key.objectid,
1902                                         level - 1, 0);
1903                 BUG_ON(ret);
1904
1905                 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1906                                         path->nodes[level]->start,
1907                                         src->root_key.objectid, level - 1, 0);
1908                 BUG_ON(ret);
1909
1910                 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1911                                         0, dest->root_key.objectid, level - 1,
1912                                         0);
1913                 BUG_ON(ret);
1914
1915                 btrfs_unlock_up_safe(path, 0);
1916
1917                 ret = level;
1918                 break;
1919         }
1920         btrfs_tree_unlock(parent);
1921         free_extent_buffer(parent);
1922         return ret;
1923 }
1924
1925 /*
1926  * helper to find next relocated block in reloc tree
1927  */
1928 static noinline_for_stack
1929 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1930                        int *level)
1931 {
1932         struct extent_buffer *eb;
1933         int i;
1934         u64 last_snapshot;
1935         u32 nritems;
1936
1937         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1938
1939         for (i = 0; i < *level; i++) {
1940                 free_extent_buffer(path->nodes[i]);
1941                 path->nodes[i] = NULL;
1942         }
1943
1944         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1945                 eb = path->nodes[i];
1946                 nritems = btrfs_header_nritems(eb);
1947                 while (path->slots[i] + 1 < nritems) {
1948                         path->slots[i]++;
1949                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1950                             last_snapshot)
1951                                 continue;
1952
1953                         *level = i;
1954                         return 0;
1955                 }
1956                 free_extent_buffer(path->nodes[i]);
1957                 path->nodes[i] = NULL;
1958         }
1959         return 1;
1960 }
1961
1962 /*
1963  * walk down reloc tree to find relocated block of lowest level
1964  */
1965 static noinline_for_stack
1966 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1967                          int *level)
1968 {
1969         struct btrfs_fs_info *fs_info = root->fs_info;
1970         struct extent_buffer *eb = NULL;
1971         int i;
1972         u64 bytenr;
1973         u64 ptr_gen = 0;
1974         u64 last_snapshot;
1975         u32 nritems;
1976
1977         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1978
1979         for (i = *level; i > 0; i--) {
1980                 struct btrfs_key first_key;
1981
1982                 eb = path->nodes[i];
1983                 nritems = btrfs_header_nritems(eb);
1984                 while (path->slots[i] < nritems) {
1985                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1986                         if (ptr_gen > last_snapshot)
1987                                 break;
1988                         path->slots[i]++;
1989                 }
1990                 if (path->slots[i] >= nritems) {
1991                         if (i == *level)
1992                                 break;
1993                         *level = i + 1;
1994                         return 0;
1995                 }
1996                 if (i == 1) {
1997                         *level = i;
1998                         return 0;
1999                 }
2000
2001                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2002                 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2003                 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2004                                      &first_key);
2005                 if (IS_ERR(eb)) {
2006                         return PTR_ERR(eb);
2007                 } else if (!extent_buffer_uptodate(eb)) {
2008                         free_extent_buffer(eb);
2009                         return -EIO;
2010                 }
2011                 BUG_ON(btrfs_header_level(eb) != i - 1);
2012                 path->nodes[i - 1] = eb;
2013                 path->slots[i - 1] = 0;
2014         }
2015         return 1;
2016 }
2017
2018 /*
2019  * invalidate extent cache for file extents whose key in range of
2020  * [min_key, max_key)
2021  */
2022 static int invalidate_extent_cache(struct btrfs_root *root,
2023                                    struct btrfs_key *min_key,
2024                                    struct btrfs_key *max_key)
2025 {
2026         struct btrfs_fs_info *fs_info = root->fs_info;
2027         struct inode *inode = NULL;
2028         u64 objectid;
2029         u64 start, end;
2030         u64 ino;
2031
2032         objectid = min_key->objectid;
2033         while (1) {
2034                 cond_resched();
2035                 iput(inode);
2036
2037                 if (objectid > max_key->objectid)
2038                         break;
2039
2040                 inode = find_next_inode(root, objectid);
2041                 if (!inode)
2042                         break;
2043                 ino = btrfs_ino(BTRFS_I(inode));
2044
2045                 if (ino > max_key->objectid) {
2046                         iput(inode);
2047                         break;
2048                 }
2049
2050                 objectid = ino + 1;
2051                 if (!S_ISREG(inode->i_mode))
2052                         continue;
2053
2054                 if (unlikely(min_key->objectid == ino)) {
2055                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2056                                 continue;
2057                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2058                                 start = 0;
2059                         else {
2060                                 start = min_key->offset;
2061                                 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2062                         }
2063                 } else {
2064                         start = 0;
2065                 }
2066
2067                 if (unlikely(max_key->objectid == ino)) {
2068                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2069                                 continue;
2070                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2071                                 end = (u64)-1;
2072                         } else {
2073                                 if (max_key->offset == 0)
2074                                         continue;
2075                                 end = max_key->offset;
2076                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2077                                 end--;
2078                         }
2079                 } else {
2080                         end = (u64)-1;
2081                 }
2082
2083                 /* the lock_extent waits for readpage to complete */
2084                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2085                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2086                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2087         }
2088         return 0;
2089 }
2090
2091 static int find_next_key(struct btrfs_path *path, int level,
2092                          struct btrfs_key *key)
2093
2094 {
2095         while (level < BTRFS_MAX_LEVEL) {
2096                 if (!path->nodes[level])
2097                         break;
2098                 if (path->slots[level] + 1 <
2099                     btrfs_header_nritems(path->nodes[level])) {
2100                         btrfs_node_key_to_cpu(path->nodes[level], key,
2101                                               path->slots[level] + 1);
2102                         return 0;
2103                 }
2104                 level++;
2105         }
2106         return 1;
2107 }
2108
2109 /*
2110  * merge the relocated tree blocks in reloc tree with corresponding
2111  * fs tree.
2112  */
2113 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2114                                                struct btrfs_root *root)
2115 {
2116         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2117         LIST_HEAD(inode_list);
2118         struct btrfs_key key;
2119         struct btrfs_key next_key;
2120         struct btrfs_trans_handle *trans = NULL;
2121         struct btrfs_root *reloc_root;
2122         struct btrfs_root_item *root_item;
2123         struct btrfs_path *path;
2124         struct extent_buffer *leaf;
2125         int level;
2126         int max_level;
2127         int replaced = 0;
2128         int ret;
2129         int err = 0;
2130         u32 min_reserved;
2131
2132         path = btrfs_alloc_path();
2133         if (!path)
2134                 return -ENOMEM;
2135         path->reada = READA_FORWARD;
2136
2137         reloc_root = root->reloc_root;
2138         root_item = &reloc_root->root_item;
2139
2140         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2141                 level = btrfs_root_level(root_item);
2142                 extent_buffer_get(reloc_root->node);
2143                 path->nodes[level] = reloc_root->node;
2144                 path->slots[level] = 0;
2145         } else {
2146                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2147
2148                 level = root_item->drop_level;
2149                 BUG_ON(level == 0);
2150                 path->lowest_level = level;
2151                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2152                 path->lowest_level = 0;
2153                 if (ret < 0) {
2154                         btrfs_free_path(path);
2155                         return ret;
2156                 }
2157
2158                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2159                                       path->slots[level]);
2160                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2161
2162                 btrfs_unlock_up_safe(path, 0);
2163         }
2164
2165         min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2166         memset(&next_key, 0, sizeof(next_key));
2167
2168         while (1) {
2169                 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2170                                              BTRFS_RESERVE_FLUSH_ALL);
2171                 if (ret) {
2172                         err = ret;
2173                         goto out;
2174                 }
2175                 trans = btrfs_start_transaction(root, 0);
2176                 if (IS_ERR(trans)) {
2177                         err = PTR_ERR(trans);
2178                         trans = NULL;
2179                         goto out;
2180                 }
2181                 trans->block_rsv = rc->block_rsv;
2182
2183                 replaced = 0;
2184                 max_level = level;
2185
2186                 ret = walk_down_reloc_tree(reloc_root, path, &level);
2187                 if (ret < 0) {
2188                         err = ret;
2189                         goto out;
2190                 }
2191                 if (ret > 0)
2192                         break;
2193
2194                 if (!find_next_key(path, level, &key) &&
2195                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2196                         ret = 0;
2197                 } else {
2198                         ret = replace_path(trans, root, reloc_root, path,
2199                                            &next_key, level, max_level);
2200                 }
2201                 if (ret < 0) {
2202                         err = ret;
2203                         goto out;
2204                 }
2205
2206                 if (ret > 0) {
2207                         level = ret;
2208                         btrfs_node_key_to_cpu(path->nodes[level], &key,
2209                                               path->slots[level]);
2210                         replaced = 1;
2211                 }
2212
2213                 ret = walk_up_reloc_tree(reloc_root, path, &level);
2214                 if (ret > 0)
2215                         break;
2216
2217                 BUG_ON(level == 0);
2218                 /*
2219                  * save the merging progress in the drop_progress.
2220                  * this is OK since root refs == 1 in this case.
2221                  */
2222                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2223                                path->slots[level]);
2224                 root_item->drop_level = level;
2225
2226                 btrfs_end_transaction_throttle(trans);
2227                 trans = NULL;
2228
2229                 btrfs_btree_balance_dirty(fs_info);
2230
2231                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2232                         invalidate_extent_cache(root, &key, &next_key);
2233         }
2234
2235         /*
2236          * handle the case only one block in the fs tree need to be
2237          * relocated and the block is tree root.
2238          */
2239         leaf = btrfs_lock_root_node(root);
2240         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2241         btrfs_tree_unlock(leaf);
2242         free_extent_buffer(leaf);
2243         if (ret < 0)
2244                 err = ret;
2245 out:
2246         btrfs_free_path(path);
2247
2248         if (err == 0) {
2249                 memset(&root_item->drop_progress, 0,
2250                        sizeof(root_item->drop_progress));
2251                 root_item->drop_level = 0;
2252                 btrfs_set_root_refs(root_item, 0);
2253                 btrfs_update_reloc_root(trans, root);
2254         }
2255
2256         if (trans)
2257                 btrfs_end_transaction_throttle(trans);
2258
2259         btrfs_btree_balance_dirty(fs_info);
2260
2261         if (replaced && rc->stage == UPDATE_DATA_PTRS)
2262                 invalidate_extent_cache(root, &key, &next_key);
2263
2264         return err;
2265 }
2266
2267 static noinline_for_stack
2268 int prepare_to_merge(struct reloc_control *rc, int err)
2269 {
2270         struct btrfs_root *root = rc->extent_root;
2271         struct btrfs_fs_info *fs_info = root->fs_info;
2272         struct btrfs_root *reloc_root;
2273         struct btrfs_trans_handle *trans;
2274         LIST_HEAD(reloc_roots);
2275         u64 num_bytes = 0;
2276         int ret;
2277
2278         mutex_lock(&fs_info->reloc_mutex);
2279         rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2280         rc->merging_rsv_size += rc->nodes_relocated * 2;
2281         mutex_unlock(&fs_info->reloc_mutex);
2282
2283 again:
2284         if (!err) {
2285                 num_bytes = rc->merging_rsv_size;
2286                 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2287                                           BTRFS_RESERVE_FLUSH_ALL);
2288                 if (ret)
2289                         err = ret;
2290         }
2291
2292         trans = btrfs_join_transaction(rc->extent_root);
2293         if (IS_ERR(trans)) {
2294                 if (!err)
2295                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
2296                                                 num_bytes);
2297                 return PTR_ERR(trans);
2298         }
2299
2300         if (!err) {
2301                 if (num_bytes != rc->merging_rsv_size) {
2302                         btrfs_end_transaction(trans);
2303                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
2304                                                 num_bytes);
2305                         goto again;
2306                 }
2307         }
2308
2309         rc->merge_reloc_tree = 1;
2310
2311         while (!list_empty(&rc->reloc_roots)) {
2312                 reloc_root = list_entry(rc->reloc_roots.next,
2313                                         struct btrfs_root, root_list);
2314                 list_del_init(&reloc_root->root_list);
2315
2316                 root = read_fs_root(fs_info, reloc_root->root_key.offset);
2317                 BUG_ON(IS_ERR(root));
2318                 BUG_ON(root->reloc_root != reloc_root);
2319
2320                 /*
2321                  * set reference count to 1, so btrfs_recover_relocation
2322                  * knows it should resumes merging
2323                  */
2324                 if (!err)
2325                         btrfs_set_root_refs(&reloc_root->root_item, 1);
2326                 btrfs_update_reloc_root(trans, root);
2327
2328                 list_add(&reloc_root->root_list, &reloc_roots);
2329         }
2330
2331         list_splice(&reloc_roots, &rc->reloc_roots);
2332
2333         if (!err)
2334                 btrfs_commit_transaction(trans);
2335         else
2336                 btrfs_end_transaction(trans);
2337         return err;
2338 }
2339
2340 static noinline_for_stack
2341 void free_reloc_roots(struct list_head *list)
2342 {
2343         struct btrfs_root *reloc_root;
2344
2345         while (!list_empty(list)) {
2346                 reloc_root = list_entry(list->next, struct btrfs_root,
2347                                         root_list);
2348                 __del_reloc_root(reloc_root);
2349                 free_extent_buffer(reloc_root->node);
2350                 free_extent_buffer(reloc_root->commit_root);
2351                 reloc_root->node = NULL;
2352                 reloc_root->commit_root = NULL;
2353         }
2354 }
2355
2356 static noinline_for_stack
2357 void merge_reloc_roots(struct reloc_control *rc)
2358 {
2359         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2360         struct btrfs_root *root;
2361         struct btrfs_root *reloc_root;
2362         LIST_HEAD(reloc_roots);
2363         int found = 0;
2364         int ret = 0;
2365 again:
2366         root = rc->extent_root;
2367
2368         /*
2369          * this serializes us with btrfs_record_root_in_transaction,
2370          * we have to make sure nobody is in the middle of
2371          * adding their roots to the list while we are
2372          * doing this splice
2373          */
2374         mutex_lock(&fs_info->reloc_mutex);
2375         list_splice_init(&rc->reloc_roots, &reloc_roots);
2376         mutex_unlock(&fs_info->reloc_mutex);
2377
2378         while (!list_empty(&reloc_roots)) {
2379                 found = 1;
2380                 reloc_root = list_entry(reloc_roots.next,
2381                                         struct btrfs_root, root_list);
2382
2383                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2384                         root = read_fs_root(fs_info,
2385                                             reloc_root->root_key.offset);
2386                         BUG_ON(IS_ERR(root));
2387                         BUG_ON(root->reloc_root != reloc_root);
2388
2389                         ret = merge_reloc_root(rc, root);
2390                         if (ret) {
2391                                 if (list_empty(&reloc_root->root_list))
2392                                         list_add_tail(&reloc_root->root_list,
2393                                                       &reloc_roots);
2394                                 goto out;
2395                         }
2396                 } else {
2397                         list_del_init(&reloc_root->root_list);
2398                 }
2399
2400                 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2401                 if (ret < 0) {
2402                         if (list_empty(&reloc_root->root_list))
2403                                 list_add_tail(&reloc_root->root_list,
2404                                               &reloc_roots);
2405                         goto out;
2406                 }
2407         }
2408
2409         if (found) {
2410                 found = 0;
2411                 goto again;
2412         }
2413 out:
2414         if (ret) {
2415                 btrfs_handle_fs_error(fs_info, ret, NULL);
2416                 if (!list_empty(&reloc_roots))
2417                         free_reloc_roots(&reloc_roots);
2418
2419                 /* new reloc root may be added */
2420                 mutex_lock(&fs_info->reloc_mutex);
2421                 list_splice_init(&rc->reloc_roots, &reloc_roots);
2422                 mutex_unlock(&fs_info->reloc_mutex);
2423                 if (!list_empty(&reloc_roots))
2424                         free_reloc_roots(&reloc_roots);
2425         }
2426
2427         BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2428 }
2429
2430 static void free_block_list(struct rb_root *blocks)
2431 {
2432         struct tree_block *block;
2433         struct rb_node *rb_node;
2434         while ((rb_node = rb_first(blocks))) {
2435                 block = rb_entry(rb_node, struct tree_block, rb_node);
2436                 rb_erase(rb_node, blocks);
2437                 kfree(block);
2438         }
2439 }
2440
2441 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2442                                       struct btrfs_root *reloc_root)
2443 {
2444         struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2445         struct btrfs_root *root;
2446
2447         if (reloc_root->last_trans == trans->transid)
2448                 return 0;
2449
2450         root = read_fs_root(fs_info, reloc_root->root_key.offset);
2451         BUG_ON(IS_ERR(root));
2452         BUG_ON(root->reloc_root != reloc_root);
2453
2454         return btrfs_record_root_in_trans(trans, root);
2455 }
2456
2457 static noinline_for_stack
2458 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2459                                      struct reloc_control *rc,
2460                                      struct backref_node *node,
2461                                      struct backref_edge *edges[])
2462 {
2463         struct backref_node *next;
2464         struct btrfs_root *root;
2465         int index = 0;
2466
2467         next = node;
2468         while (1) {
2469                 cond_resched();
2470                 next = walk_up_backref(next, edges, &index);
2471                 root = next->root;
2472                 BUG_ON(!root);
2473                 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2474
2475                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2476                         record_reloc_root_in_trans(trans, root);
2477                         break;
2478                 }
2479
2480                 btrfs_record_root_in_trans(trans, root);
2481                 root = root->reloc_root;
2482
2483                 if (next->new_bytenr != root->node->start) {
2484                         BUG_ON(next->new_bytenr);
2485                         BUG_ON(!list_empty(&next->list));
2486                         next->new_bytenr = root->node->start;
2487                         next->root = root;
2488                         list_add_tail(&next->list,
2489                                       &rc->backref_cache.changed);
2490                         __mark_block_processed(rc, next);
2491                         break;
2492                 }
2493
2494                 WARN_ON(1);
2495                 root = NULL;
2496                 next = walk_down_backref(edges, &index);
2497                 if (!next || next->level <= node->level)
2498                         break;
2499         }
2500         if (!root)
2501                 return NULL;
2502
2503         next = node;
2504         /* setup backref node path for btrfs_reloc_cow_block */
2505         while (1) {
2506                 rc->backref_cache.path[next->level] = next;
2507                 if (--index < 0)
2508                         break;
2509                 next = edges[index]->node[UPPER];
2510         }
2511         return root;
2512 }
2513
2514 /*
2515  * select a tree root for relocation. return NULL if the block
2516  * is reference counted. we should use do_relocation() in this
2517  * case. return a tree root pointer if the block isn't reference
2518  * counted. return -ENOENT if the block is root of reloc tree.
2519  */
2520 static noinline_for_stack
2521 struct btrfs_root *select_one_root(struct backref_node *node)
2522 {
2523         struct backref_node *next;
2524         struct btrfs_root *root;
2525         struct btrfs_root *fs_root = NULL;
2526         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2527         int index = 0;
2528
2529         next = node;
2530         while (1) {
2531                 cond_resched();
2532                 next = walk_up_backref(next, edges, &index);
2533                 root = next->root;
2534                 BUG_ON(!root);
2535
2536                 /* no other choice for non-references counted tree */
2537                 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2538                         return root;
2539
2540                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2541                         fs_root = root;
2542
2543                 if (next != node)
2544                         return NULL;
2545
2546                 next = walk_down_backref(edges, &index);
2547                 if (!next || next->level <= node->level)
2548                         break;
2549         }
2550
2551         if (!fs_root)
2552                 return ERR_PTR(-ENOENT);
2553         return fs_root;
2554 }
2555
2556 static noinline_for_stack
2557 u64 calcu_metadata_size(struct reloc_control *rc,
2558                         struct backref_node *node, int reserve)
2559 {
2560         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2561         struct backref_node *next = node;
2562         struct backref_edge *edge;
2563         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2564         u64 num_bytes = 0;
2565         int index = 0;
2566
2567         BUG_ON(reserve && node->processed);
2568
2569         while (next) {
2570                 cond_resched();
2571                 while (1) {
2572                         if (next->processed && (reserve || next != node))
2573                                 break;
2574
2575                         num_bytes += fs_info->nodesize;
2576
2577                         if (list_empty(&next->upper))
2578                                 break;
2579
2580                         edge = list_entry(next->upper.next,
2581                                           struct backref_edge, list[LOWER]);
2582                         edges[index++] = edge;
2583                         next = edge->node[UPPER];
2584                 }
2585                 next = walk_down_backref(edges, &index);
2586         }
2587         return num_bytes;
2588 }
2589
2590 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2591                                   struct reloc_control *rc,
2592                                   struct backref_node *node)
2593 {
2594         struct btrfs_root *root = rc->extent_root;
2595         struct btrfs_fs_info *fs_info = root->fs_info;
2596         u64 num_bytes;
2597         int ret;
2598         u64 tmp;
2599
2600         num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2601
2602         trans->block_rsv = rc->block_rsv;
2603         rc->reserved_bytes += num_bytes;
2604
2605         /*
2606          * We are under a transaction here so we can only do limited flushing.
2607          * If we get an enospc just kick back -EAGAIN so we know to drop the
2608          * transaction and try to refill when we can flush all the things.
2609          */
2610         ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2611                                 BTRFS_RESERVE_FLUSH_LIMIT);
2612         if (ret) {
2613                 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2614                 while (tmp <= rc->reserved_bytes)
2615                         tmp <<= 1;
2616                 /*
2617                  * only one thread can access block_rsv at this point,
2618                  * so we don't need hold lock to protect block_rsv.
2619                  * we expand more reservation size here to allow enough
2620                  * space for relocation and we will return eailer in
2621                  * enospc case.
2622                  */
2623                 rc->block_rsv->size = tmp + fs_info->nodesize *
2624                                       RELOCATION_RESERVED_NODES;
2625                 return -EAGAIN;
2626         }
2627
2628         return 0;
2629 }
2630
2631 /*
2632  * relocate a block tree, and then update pointers in upper level
2633  * blocks that reference the block to point to the new location.
2634  *
2635  * if called by link_to_upper, the block has already been relocated.
2636  * in that case this function just updates pointers.
2637  */
2638 static int do_relocation(struct btrfs_trans_handle *trans,
2639                          struct reloc_control *rc,
2640                          struct backref_node *node,
2641                          struct btrfs_key *key,
2642                          struct btrfs_path *path, int lowest)
2643 {
2644         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2645         struct backref_node *upper;
2646         struct backref_edge *edge;
2647         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2648         struct btrfs_root *root;
2649         struct extent_buffer *eb;
2650         u32 blocksize;
2651         u64 bytenr;
2652         u64 generation;
2653         int slot;
2654         int ret;
2655         int err = 0;
2656
2657         BUG_ON(lowest && node->eb);
2658
2659         path->lowest_level = node->level + 1;
2660         rc->backref_cache.path[node->level] = node;
2661         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2662                 struct btrfs_key first_key;
2663
2664                 cond_resched();
2665
2666                 upper = edge->node[UPPER];
2667                 root = select_reloc_root(trans, rc, upper, edges);
2668                 BUG_ON(!root);
2669
2670                 if (upper->eb && !upper->locked) {
2671                         if (!lowest) {
2672                                 ret = btrfs_bin_search(upper->eb, key,
2673                                                        upper->level, &slot);
2674                                 BUG_ON(ret);
2675                                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2676                                 if (node->eb->start == bytenr)
2677                                         goto next;
2678                         }
2679                         drop_node_buffer(upper);
2680                 }
2681
2682                 if (!upper->eb) {
2683                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2684                         if (ret) {
2685                                 if (ret < 0)
2686                                         err = ret;
2687                                 else
2688                                         err = -ENOENT;
2689
2690                                 btrfs_release_path(path);
2691                                 break;
2692                         }
2693
2694                         if (!upper->eb) {
2695                                 upper->eb = path->nodes[upper->level];
2696                                 path->nodes[upper->level] = NULL;
2697                         } else {
2698                                 BUG_ON(upper->eb != path->nodes[upper->level]);
2699                         }
2700
2701                         upper->locked = 1;
2702                         path->locks[upper->level] = 0;
2703
2704                         slot = path->slots[upper->level];
2705                         btrfs_release_path(path);
2706                 } else {
2707                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2708                                                &slot);
2709                         BUG_ON(ret);
2710                 }
2711
2712                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2713                 if (lowest) {
2714                         if (bytenr != node->bytenr) {
2715                                 btrfs_err(root->fs_info,
2716                 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2717                                           bytenr, node->bytenr, slot,
2718                                           upper->eb->start);
2719                                 err = -EIO;
2720                                 goto next;
2721                         }
2722                 } else {
2723                         if (node->eb->start == bytenr)
2724                                 goto next;
2725                 }
2726
2727                 blocksize = root->fs_info->nodesize;
2728                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2729                 btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2730                 eb = read_tree_block(fs_info, bytenr, generation,
2731                                      upper->level - 1, &first_key);
2732                 if (IS_ERR(eb)) {
2733                         err = PTR_ERR(eb);
2734                         goto next;
2735                 } else if (!extent_buffer_uptodate(eb)) {
2736                         free_extent_buffer(eb);
2737                         err = -EIO;
2738                         goto next;
2739                 }
2740                 btrfs_tree_lock(eb);
2741                 btrfs_set_lock_blocking(eb);
2742
2743                 if (!node->eb) {
2744                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2745                                               slot, &eb);
2746                         btrfs_tree_unlock(eb);
2747                         free_extent_buffer(eb);
2748                         if (ret < 0) {
2749                                 err = ret;
2750                                 goto next;
2751                         }
2752                         BUG_ON(node->eb != eb);
2753                 } else {
2754                         btrfs_set_node_blockptr(upper->eb, slot,
2755                                                 node->eb->start);
2756                         btrfs_set_node_ptr_generation(upper->eb, slot,
2757                                                       trans->transid);
2758                         btrfs_mark_buffer_dirty(upper->eb);
2759
2760                         ret = btrfs_inc_extent_ref(trans, root,
2761                                                 node->eb->start, blocksize,
2762                                                 upper->eb->start,
2763                                                 btrfs_header_owner(upper->eb),
2764                                                 node->level, 0);
2765                         BUG_ON(ret);
2766
2767                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2768                         BUG_ON(ret);
2769                 }
2770 next:
2771                 if (!upper->pending)
2772                         drop_node_buffer(upper);
2773                 else
2774                         unlock_node_buffer(upper);
2775                 if (err)
2776                         break;
2777         }
2778
2779         if (!err && node->pending) {
2780                 drop_node_buffer(node);
2781                 list_move_tail(&node->list, &rc->backref_cache.changed);
2782                 node->pending = 0;
2783         }
2784
2785         path->lowest_level = 0;
2786         BUG_ON(err == -ENOSPC);
2787         return err;
2788 }
2789
2790 static int link_to_upper(struct btrfs_trans_handle *trans,
2791                          struct reloc_control *rc,
2792                          struct backref_node *node,
2793                          struct btrfs_path *path)
2794 {
2795         struct btrfs_key key;
2796
2797         btrfs_node_key_to_cpu(node->eb, &key, 0);
2798         return do_relocation(trans, rc, node, &key, path, 0);
2799 }
2800
2801 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2802                                 struct reloc_control *rc,
2803                                 struct btrfs_path *path, int err)
2804 {
2805         LIST_HEAD(list);
2806         struct backref_cache *cache = &rc->backref_cache;
2807         struct backref_node *node;
2808         int level;
2809         int ret;
2810
2811         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2812                 while (!list_empty(&cache->pending[level])) {
2813                         node = list_entry(cache->pending[level].next,
2814                                           struct backref_node, list);
2815                         list_move_tail(&node->list, &list);
2816                         BUG_ON(!node->pending);
2817
2818                         if (!err) {
2819                                 ret = link_to_upper(trans, rc, node, path);
2820                                 if (ret < 0)
2821                                         err = ret;
2822                         }
2823                 }
2824                 list_splice_init(&list, &cache->pending[level]);
2825         }
2826         return err;
2827 }
2828
2829 static void mark_block_processed(struct reloc_control *rc,
2830                                  u64 bytenr, u32 blocksize)
2831 {
2832         set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2833                         EXTENT_DIRTY);
2834 }
2835
2836 static void __mark_block_processed(struct reloc_control *rc,
2837                                    struct backref_node *node)
2838 {
2839         u32 blocksize;
2840         if (node->level == 0 ||
2841             in_block_group(node->bytenr, rc->block_group)) {
2842                 blocksize = rc->extent_root->fs_info->nodesize;
2843                 mark_block_processed(rc, node->bytenr, blocksize);
2844         }
2845         node->processed = 1;
2846 }
2847
2848 /*
2849  * mark a block and all blocks directly/indirectly reference the block
2850  * as processed.
2851  */
2852 static void update_processed_blocks(struct reloc_control *rc,
2853                                     struct backref_node *node)
2854 {
2855         struct backref_node *next = node;
2856         struct backref_edge *edge;
2857         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2858         int index = 0;
2859
2860         while (next) {
2861                 cond_resched();
2862                 while (1) {
2863                         if (next->processed)
2864                                 break;
2865
2866                         __mark_block_processed(rc, next);
2867
2868                         if (list_empty(&next->upper))
2869                                 break;
2870
2871                         edge = list_entry(next->upper.next,
2872                                           struct backref_edge, list[LOWER]);
2873                         edges[index++] = edge;
2874                         next = edge->node[UPPER];
2875                 }
2876                 next = walk_down_backref(edges, &index);
2877         }
2878 }
2879
2880 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2881 {
2882         u32 blocksize = rc->extent_root->fs_info->nodesize;
2883
2884         if (test_range_bit(&rc->processed_blocks, bytenr,
2885                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2886                 return 1;
2887         return 0;
2888 }
2889
2890 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2891                               struct tree_block *block)
2892 {
2893         struct extent_buffer *eb;
2894
2895         BUG_ON(block->key_ready);
2896         eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
2897                              block->level, NULL);
2898         if (IS_ERR(eb)) {
2899                 return PTR_ERR(eb);
2900         } else if (!extent_buffer_uptodate(eb)) {
2901                 free_extent_buffer(eb);
2902                 return -EIO;
2903         }
2904         WARN_ON(btrfs_header_level(eb) != block->level);
2905         if (block->level == 0)
2906                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2907         else
2908                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2909         free_extent_buffer(eb);
2910         block->key_ready = 1;
2911         return 0;
2912 }
2913
2914 /*
2915  * helper function to relocate a tree block
2916  */
2917 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2918                                 struct reloc_control *rc,
2919                                 struct backref_node *node,
2920                                 struct btrfs_key *key,
2921                                 struct btrfs_path *path)
2922 {
2923         struct btrfs_root *root;
2924         int ret = 0;
2925
2926         if (!node)
2927                 return 0;
2928
2929         BUG_ON(node->processed);
2930         root = select_one_root(node);
2931         if (root == ERR_PTR(-ENOENT)) {
2932                 update_processed_blocks(rc, node);
2933                 goto out;
2934         }
2935
2936         if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2937                 ret = reserve_metadata_space(trans, rc, node);
2938                 if (ret)
2939                         goto out;
2940         }
2941
2942         if (root) {
2943                 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2944                         BUG_ON(node->new_bytenr);
2945                         BUG_ON(!list_empty(&node->list));
2946                         btrfs_record_root_in_trans(trans, root);
2947                         root = root->reloc_root;
2948                         node->new_bytenr = root->node->start;
2949                         node->root = root;
2950                         list_add_tail(&node->list, &rc->backref_cache.changed);
2951                 } else {
2952                         path->lowest_level = node->level;
2953                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2954                         btrfs_release_path(path);
2955                         if (ret > 0)
2956                                 ret = 0;
2957                 }
2958                 if (!ret)
2959                         update_processed_blocks(rc, node);
2960         } else {
2961                 ret = do_relocation(trans, rc, node, key, path, 1);
2962         }
2963 out:
2964         if (ret || node->level == 0 || node->cowonly)
2965                 remove_backref_node(&rc->backref_cache, node);
2966         return ret;
2967 }
2968
2969 /*
2970  * relocate a list of blocks
2971  */
2972 static noinline_for_stack
2973 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2974                          struct reloc_control *rc, struct rb_root *blocks)
2975 {
2976         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2977         struct backref_node *node;
2978         struct btrfs_path *path;
2979         struct tree_block *block;
2980         struct rb_node *rb_node;
2981         int ret;
2982         int err = 0;
2983
2984         path = btrfs_alloc_path();
2985         if (!path) {
2986                 err = -ENOMEM;
2987                 goto out_free_blocks;
2988         }
2989
2990         rb_node = rb_first(blocks);
2991         while (rb_node) {
2992                 block = rb_entry(rb_node, struct tree_block, rb_node);
2993                 if (!block->key_ready)
2994                         readahead_tree_block(fs_info, block->bytenr);
2995                 rb_node = rb_next(rb_node);
2996         }
2997
2998         rb_node = rb_first(blocks);
2999         while (rb_node) {
3000                 block = rb_entry(rb_node, struct tree_block, rb_node);
3001                 if (!block->key_ready) {
3002                         err = get_tree_block_key(fs_info, block);
3003                         if (err)
3004                                 goto out_free_path;
3005                 }
3006                 rb_node = rb_next(rb_node);
3007         }
3008
3009         rb_node = rb_first(blocks);
3010         while (rb_node) {
3011                 block = rb_entry(rb_node, struct tree_block, rb_node);
3012
3013                 node = build_backref_tree(rc, &block->key,
3014                                           block->level, block->bytenr);
3015                 if (IS_ERR(node)) {
3016                         err = PTR_ERR(node);
3017                         goto out;
3018                 }
3019
3020                 ret = relocate_tree_block(trans, rc, node, &block->key,
3021                                           path);
3022                 if (ret < 0) {
3023                         if (ret != -EAGAIN || rb_node == rb_first(blocks))
3024                                 err = ret;
3025                         goto out;
3026                 }
3027                 rb_node = rb_next(rb_node);
3028         }
3029 out:
3030         err = finish_pending_nodes(trans, rc, path, err);
3031
3032 out_free_path:
3033         btrfs_free_path(path);
3034 out_free_blocks:
3035         free_block_list(blocks);
3036         return err;
3037 }
3038
3039 static noinline_for_stack
3040 int prealloc_file_extent_cluster(struct inode *inode,
3041                                  struct file_extent_cluster *cluster)
3042 {
3043         u64 alloc_hint = 0;
3044         u64 start;
3045         u64 end;
3046         u64 offset = BTRFS_I(inode)->index_cnt;
3047         u64 num_bytes;
3048         int nr = 0;
3049         int ret = 0;
3050         u64 prealloc_start = cluster->start - offset;
3051         u64 prealloc_end = cluster->end - offset;
3052         u64 cur_offset;
3053         struct extent_changeset *data_reserved = NULL;
3054
3055         BUG_ON(cluster->start != cluster->boundary[0]);
3056         inode_lock(inode);
3057
3058         ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3059                                           prealloc_end + 1 - prealloc_start);
3060         if (ret)
3061                 goto out;
3062
3063         cur_offset = prealloc_start;
3064         while (nr < cluster->nr) {
3065                 start = cluster->boundary[nr] - offset;
3066                 if (nr + 1 < cluster->nr)
3067                         end = cluster->boundary[nr + 1] - 1 - offset;
3068                 else
3069                         end = cluster->end - offset;
3070
3071                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3072                 num_bytes = end + 1 - start;
3073                 if (cur_offset < start)
3074                         btrfs_free_reserved_data_space(inode, data_reserved,
3075                                         cur_offset, start - cur_offset);
3076                 ret = btrfs_prealloc_file_range(inode, 0, start,
3077                                                 num_bytes, num_bytes,
3078                                                 end + 1, &alloc_hint);
3079                 cur_offset = end + 1;
3080                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3081                 if (ret)
3082                         break;
3083                 nr++;
3084         }
3085         if (cur_offset < prealloc_end)
3086                 btrfs_free_reserved_data_space(inode, data_reserved,
3087                                 cur_offset, prealloc_end + 1 - cur_offset);
3088 out:
3089         inode_unlock(inode);
3090         extent_changeset_free(data_reserved);
3091         return ret;
3092 }
3093
3094 static noinline_for_stack
3095 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3096                          u64 block_start)
3097 {
3098         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3099         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3100         struct extent_map *em;
3101         int ret = 0;
3102
3103         em = alloc_extent_map();
3104         if (!em)
3105                 return -ENOMEM;
3106
3107         em->start = start;
3108         em->len = end + 1 - start;
3109         em->block_len = em->len;
3110         em->block_start = block_start;
3111         em->bdev = fs_info->fs_devices->latest_bdev;
3112         set_bit(EXTENT_FLAG_PINNED, &em->flags);
3113
3114         lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3115         while (1) {
3116                 write_lock(&em_tree->lock);
3117                 ret = add_extent_mapping(em_tree, em, 0);
3118                 write_unlock(&em_tree->lock);
3119                 if (ret != -EEXIST) {
3120                         free_extent_map(em);
3121                         break;
3122                 }
3123                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3124         }
3125         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3126         return ret;
3127 }
3128
3129 static int relocate_file_extent_cluster(struct inode *inode,
3130                                         struct file_extent_cluster *cluster)
3131 {
3132         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3133         u64 page_start;
3134         u64 page_end;
3135         u64 offset = BTRFS_I(inode)->index_cnt;
3136         unsigned long index;
3137         unsigned long last_index;
3138         struct page *page;
3139         struct file_ra_state *ra;
3140         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3141         int nr = 0;
3142         int ret = 0;
3143
3144         if (!cluster->nr)
3145                 return 0;
3146
3147         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3148         if (!ra)
3149                 return -ENOMEM;
3150
3151         ret = prealloc_file_extent_cluster(inode, cluster);
3152         if (ret)
3153                 goto out;
3154
3155         file_ra_state_init(ra, inode->i_mapping);
3156
3157         ret = setup_extent_mapping(inode, cluster->start - offset,
3158                                    cluster->end - offset, cluster->start);
3159         if (ret)
3160                 goto out;
3161
3162         index = (cluster->start - offset) >> PAGE_SHIFT;
3163         last_index = (cluster->end - offset) >> PAGE_SHIFT;
3164         while (index <= last_index) {
3165                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3166                                 PAGE_SIZE);
3167                 if (ret)
3168                         goto out;
3169
3170                 page = find_lock_page(inode->i_mapping, index);
3171                 if (!page) {
3172                         page_cache_sync_readahead(inode->i_mapping,
3173                                                   ra, NULL, index,
3174                                                   last_index + 1 - index);
3175                         page = find_or_create_page(inode->i_mapping, index,
3176                                                    mask);
3177                         if (!page) {
3178                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3179                                                         PAGE_SIZE, true);
3180                                 ret = -ENOMEM;
3181                                 goto out;
3182                         }
3183                 }
3184
3185                 if (PageReadahead(page)) {
3186                         page_cache_async_readahead(inode->i_mapping,
3187                                                    ra, NULL, page, index,
3188                                                    last_index + 1 - index);
3189                 }
3190
3191                 if (!PageUptodate(page)) {
3192                         btrfs_readpage(NULL, page);
3193                         lock_page(page);
3194                         if (!PageUptodate(page)) {
3195                                 unlock_page(page);
3196                                 put_page(page);
3197                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3198                                                         PAGE_SIZE, true);
3199                                 btrfs_delalloc_release_extents(BTRFS_I(inode),
3200                                                                PAGE_SIZE, true);
3201                                 ret = -EIO;
3202                                 goto out;
3203                         }
3204                 }
3205
3206                 page_start = page_offset(page);
3207                 page_end = page_start + PAGE_SIZE - 1;
3208
3209                 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3210
3211                 set_page_extent_mapped(page);
3212
3213                 if (nr < cluster->nr &&
3214                     page_start + offset == cluster->boundary[nr]) {
3215                         set_extent_bits(&BTRFS_I(inode)->io_tree,
3216                                         page_start, page_end,
3217                                         EXTENT_BOUNDARY);
3218                         nr++;
3219                 }
3220
3221                 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
3222                                                 NULL, 0);
3223                 if (ret) {
3224                         unlock_page(page);
3225                         put_page(page);
3226                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
3227                                                          PAGE_SIZE, true);
3228                         btrfs_delalloc_release_extents(BTRFS_I(inode),
3229                                                        PAGE_SIZE, true);
3230
3231                         clear_extent_bits(&BTRFS_I(inode)->io_tree,
3232                                           page_start, page_end,
3233                                           EXTENT_LOCKED | EXTENT_BOUNDARY);
3234                         goto out;
3235
3236                 }
3237                 set_page_dirty(page);
3238
3239                 unlock_extent(&BTRFS_I(inode)->io_tree,
3240                               page_start, page_end);
3241                 unlock_page(page);
3242                 put_page(page);
3243
3244                 index++;
3245                 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
3246                                                false);
3247                 balance_dirty_pages_ratelimited(inode->i_mapping);
3248                 btrfs_throttle(fs_info);
3249         }
3250         WARN_ON(nr != cluster->nr);
3251 out:
3252         kfree(ra);
3253         return ret;
3254 }
3255
3256 static noinline_for_stack
3257 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3258                          struct file_extent_cluster *cluster)
3259 {
3260         int ret;
3261
3262         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3263                 ret = relocate_file_extent_cluster(inode, cluster);
3264                 if (ret)
3265                         return ret;
3266                 cluster->nr = 0;
3267         }
3268
3269         if (!cluster->nr)
3270                 cluster->start = extent_key->objectid;
3271         else
3272                 BUG_ON(cluster->nr >= MAX_EXTENTS);
3273         cluster->end = extent_key->objectid + extent_key->offset - 1;
3274         cluster->boundary[cluster->nr] = extent_key->objectid;
3275         cluster->nr++;
3276
3277         if (cluster->nr >= MAX_EXTENTS) {
3278                 ret = relocate_file_extent_cluster(inode, cluster);
3279                 if (ret)
3280                         return ret;
3281                 cluster->nr = 0;
3282         }
3283         return 0;
3284 }
3285
3286 /*
3287  * helper to add a tree block to the list.
3288  * the major work is getting the generation and level of the block
3289  */
3290 static int add_tree_block(struct reloc_control *rc,
3291                           struct btrfs_key *extent_key,
3292                           struct btrfs_path *path,
3293                           struct rb_root *blocks)
3294 {
3295         struct extent_buffer *eb;
3296         struct btrfs_extent_item *ei;
3297         struct btrfs_tree_block_info *bi;
3298         struct tree_block *block;
3299         struct rb_node *rb_node;
3300         u32 item_size;
3301         int level = -1;
3302         u64 generation;
3303
3304         eb =  path->nodes[0];
3305         item_size = btrfs_item_size_nr(eb, path->slots[0]);
3306
3307         if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3308             item_size >= sizeof(*ei) + sizeof(*bi)) {
3309                 ei = btrfs_item_ptr(eb, path->slots[0],
3310                                 struct btrfs_extent_item);
3311                 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3312                         bi = (struct btrfs_tree_block_info *)(ei + 1);
3313                         level = btrfs_tree_block_level(eb, bi);
3314                 } else {
3315                         level = (int)extent_key->offset;
3316                 }
3317                 generation = btrfs_extent_generation(eb, ei);
3318         } else {
3319                 BUG();
3320         }
3321
3322         btrfs_release_path(path);
3323
3324         BUG_ON(level == -1);
3325
3326         block = kmalloc(sizeof(*block), GFP_NOFS);
3327         if (!block)
3328                 return -ENOMEM;
3329
3330         block->bytenr = extent_key->objectid;
3331         block->key.objectid = rc->extent_root->fs_info->nodesize;
3332         block->key.offset = generation;
3333         block->level = level;
3334         block->key_ready = 0;
3335
3336         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3337         if (rb_node)
3338                 backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3339
3340         return 0;
3341 }
3342
3343 /*
3344  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3345  */
3346 static int __add_tree_block(struct reloc_control *rc,
3347                             u64 bytenr, u32 blocksize,
3348                             struct rb_root *blocks)
3349 {
3350         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3351         struct btrfs_path *path;
3352         struct btrfs_key key;
3353         int ret;
3354         bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3355
3356         if (tree_block_processed(bytenr, rc))
3357                 return 0;
3358
3359         if (tree_search(blocks, bytenr))
3360                 return 0;
3361
3362         path = btrfs_alloc_path();
3363         if (!path)
3364                 return -ENOMEM;
3365 again:
3366         key.objectid = bytenr;
3367         if (skinny) {
3368                 key.type = BTRFS_METADATA_ITEM_KEY;
3369                 key.offset = (u64)-1;
3370         } else {
3371                 key.type = BTRFS_EXTENT_ITEM_KEY;
3372                 key.offset = blocksize;
3373         }
3374
3375         path->search_commit_root = 1;
3376         path->skip_locking = 1;
3377         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3378         if (ret < 0)
3379                 goto out;
3380
3381         if (ret > 0 && skinny) {
3382                 if (path->slots[0]) {
3383                         path->slots[0]--;
3384                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3385                                               path->slots[0]);
3386                         if (key.objectid == bytenr &&
3387                             (key.type == BTRFS_METADATA_ITEM_KEY ||
3388                              (key.type == BTRFS_EXTENT_ITEM_KEY &&
3389                               key.offset == blocksize)))
3390                                 ret = 0;
3391                 }
3392
3393                 if (ret) {
3394                         skinny = false;
3395                         btrfs_release_path(path);
3396                         goto again;
3397                 }
3398         }
3399         if (ret) {
3400                 ASSERT(ret == 1);
3401                 btrfs_print_leaf(path->nodes[0]);
3402                 btrfs_err(fs_info,
3403              "tree block extent item (%llu) is not found in extent tree",
3404                      bytenr);
3405                 WARN_ON(1);
3406                 ret = -EINVAL;
3407                 goto out;
3408         }
3409
3410         ret = add_tree_block(rc, &key, path, blocks);
3411 out:
3412         btrfs_free_path(path);
3413         return ret;
3414 }
3415
3416 /*
3417  * helper to check if the block use full backrefs for pointers in it
3418  */
3419 static int block_use_full_backref(struct reloc_control *rc,
3420                                   struct extent_buffer *eb)
3421 {
3422         u64 flags;
3423         int ret;
3424
3425         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3426             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3427                 return 1;
3428
3429         ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3430                                        eb->start, btrfs_header_level(eb), 1,
3431                                        NULL, &flags);
3432         BUG_ON(ret);
3433
3434         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3435                 ret = 1;
3436         else
3437                 ret = 0;
3438         return ret;
3439 }
3440
3441 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3442                                     struct btrfs_block_group_cache *block_group,
3443                                     struct inode *inode,
3444                                     u64 ino)
3445 {
3446         struct btrfs_key key;
3447         struct btrfs_root *root = fs_info->tree_root;
3448         struct btrfs_trans_handle *trans;
3449         int ret = 0;
3450
3451         if (inode)
3452                 goto truncate;
3453
3454         key.objectid = ino;
3455         key.type = BTRFS_INODE_ITEM_KEY;
3456         key.offset = 0;
3457
3458         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3459         if (IS_ERR(inode) || is_bad_inode(inode)) {
3460                 if (!IS_ERR(inode))
3461                         iput(inode);
3462                 return -ENOENT;
3463         }
3464
3465 truncate:
3466         ret = btrfs_check_trunc_cache_free_space(fs_info,
3467                                                  &fs_info->global_block_rsv);
3468         if (ret)
3469                 goto out;
3470
3471         trans = btrfs_join_transaction(root);
3472         if (IS_ERR(trans)) {
3473                 ret = PTR_ERR(trans);
3474                 goto out;
3475         }
3476
3477         ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3478
3479         btrfs_end_transaction(trans);
3480         btrfs_btree_balance_dirty(fs_info);
3481 out:
3482         iput(inode);
3483         return ret;
3484 }
3485
3486 /*
3487  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3488  * this function scans fs tree to find blocks reference the data extent
3489  */
3490 static int find_data_references(struct reloc_control *rc,
3491                                 struct btrfs_key *extent_key,
3492                                 struct extent_buffer *leaf,
3493                                 struct btrfs_extent_data_ref *ref,
3494                                 struct rb_root *blocks)
3495 {
3496         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3497         struct btrfs_path *path;
3498         struct tree_block *block;
3499         struct btrfs_root *root;
3500         struct btrfs_file_extent_item *fi;
3501         struct rb_node *rb_node;
3502         struct btrfs_key key;
3503         u64 ref_root;
3504         u64 ref_objectid;
3505         u64 ref_offset;
3506         u32 ref_count;
3507         u32 nritems;
3508         int err = 0;
3509         int added = 0;
3510         int counted;
3511         int ret;
3512
3513         ref_root = btrfs_extent_data_ref_root(leaf, ref);
3514         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3515         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3516         ref_count = btrfs_extent_data_ref_count(leaf, ref);
3517
3518         /*
3519          * This is an extent belonging to the free space cache, lets just delete
3520          * it and redo the search.
3521          */
3522         if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3523                 ret = delete_block_group_cache(fs_info, rc->block_group,
3524                                                NULL, ref_objectid);
3525                 if (ret != -ENOENT)
3526                         return ret;
3527                 ret = 0;
3528         }
3529
3530         path = btrfs_alloc_path();
3531         if (!path)
3532                 return -ENOMEM;
3533         path->reada = READA_FORWARD;
3534
3535         root = read_fs_root(fs_info, ref_root);
3536         if (IS_ERR(root)) {
3537                 err = PTR_ERR(root);
3538                 goto out;
3539         }
3540
3541         key.objectid = ref_objectid;
3542         key.type = BTRFS_EXTENT_DATA_KEY;
3543         if (ref_offset > ((u64)-1 << 32))
3544                 key.offset = 0;
3545         else
3546                 key.offset = ref_offset;
3547
3548         path->search_commit_root = 1;
3549         path->skip_locking = 1;
3550         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3551         if (ret < 0) {
3552                 err = ret;
3553                 goto out;
3554         }
3555
3556         leaf = path->nodes[0];
3557         nritems = btrfs_header_nritems(leaf);
3558         /*
3559          * the references in tree blocks that use full backrefs
3560          * are not counted in
3561          */
3562         if (block_use_full_backref(rc, leaf))
3563                 counted = 0;
3564         else
3565                 counted = 1;
3566         rb_node = tree_search(blocks, leaf->start);
3567         if (rb_node) {
3568                 if (counted)
3569                         added = 1;
3570                 else
3571                         path->slots[0] = nritems;
3572         }
3573
3574         while (ref_count > 0) {
3575                 while (path->slots[0] >= nritems) {
3576                         ret = btrfs_next_leaf(root, path);
3577                         if (ret < 0) {
3578                                 err = ret;
3579                                 goto out;
3580                         }
3581                         if (WARN_ON(ret > 0))
3582                                 goto out;
3583
3584                         leaf = path->nodes[0];
3585                         nritems = btrfs_header_nritems(leaf);
3586                         added = 0;
3587
3588                         if (block_use_full_backref(rc, leaf))
3589                                 counted = 0;
3590                         else
3591                                 counted = 1;
3592                         rb_node = tree_search(blocks, leaf->start);
3593                         if (rb_node) {
3594                                 if (counted)
3595                                         added = 1;
3596                                 else
3597                                         path->slots[0] = nritems;
3598                         }
3599                 }
3600
3601                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3602                 if (WARN_ON(key.objectid != ref_objectid ||
3603                     key.type != BTRFS_EXTENT_DATA_KEY))
3604                         break;
3605
3606                 fi = btrfs_item_ptr(leaf, path->slots[0],
3607                                     struct btrfs_file_extent_item);
3608
3609                 if (btrfs_file_extent_type(leaf, fi) ==
3610                     BTRFS_FILE_EXTENT_INLINE)
3611                         goto next;
3612
3613                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3614                     extent_key->objectid)
3615                         goto next;
3616
3617                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3618                 if (key.offset != ref_offset)
3619                         goto next;
3620
3621                 if (counted)
3622                         ref_count--;
3623                 if (added)
3624                         goto next;
3625
3626                 if (!tree_block_processed(leaf->start, rc)) {
3627                         block = kmalloc(sizeof(*block), GFP_NOFS);
3628                         if (!block) {
3629                                 err = -ENOMEM;
3630                                 break;
3631                         }
3632                         block->bytenr = leaf->start;
3633                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3634                         block->level = 0;
3635                         block->key_ready = 1;
3636                         rb_node = tree_insert(blocks, block->bytenr,
3637                                               &block->rb_node);
3638                         if (rb_node)
3639                                 backref_tree_panic(rb_node, -EEXIST,
3640                                                    block->bytenr);
3641                 }
3642                 if (counted)
3643                         added = 1;
3644                 else
3645                         path->slots[0] = nritems;
3646 next:
3647                 path->slots[0]++;
3648
3649         }
3650 out:
3651         btrfs_free_path(path);
3652         return err;
3653 }
3654
3655 /*
3656  * helper to find all tree blocks that reference a given data extent
3657  */
3658 static noinline_for_stack
3659 int add_data_references(struct reloc_control *rc,
3660                         struct btrfs_key *extent_key,
3661                         struct btrfs_path *path,
3662                         struct rb_root *blocks)
3663 {
3664         struct btrfs_key key;
3665         struct extent_buffer *eb;
3666         struct btrfs_extent_data_ref *dref;
3667         struct btrfs_extent_inline_ref *iref;
3668         unsigned long ptr;
3669         unsigned long end;
3670         u32 blocksize = rc->extent_root->fs_info->nodesize;
3671         int ret = 0;
3672         int err = 0;
3673
3674         eb = path->nodes[0];
3675         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3676         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3677         ptr += sizeof(struct btrfs_extent_item);
3678
3679         while (ptr < end) {
3680                 iref = (struct btrfs_extent_inline_ref *)ptr;
3681                 key.type = btrfs_get_extent_inline_ref_type(eb, iref,
3682                                                         BTRFS_REF_TYPE_DATA);
3683                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3684                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3685                         ret = __add_tree_block(rc, key.offset, blocksize,
3686                                                blocks);
3687                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3688                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3689                         ret = find_data_references(rc, extent_key,
3690                                                    eb, dref, blocks);
3691                 } else {
3692                         ret = -EUCLEAN;
3693                         btrfs_err(rc->extent_root->fs_info,
3694                      "extent %llu slot %d has an invalid inline ref type",
3695                              eb->start, path->slots[0]);
3696                 }
3697                 if (ret) {
3698                         err = ret;
3699                         goto out;
3700                 }
3701                 ptr += btrfs_extent_inline_ref_size(key.type);
3702         }
3703         WARN_ON(ptr > end);
3704
3705         while (1) {
3706                 cond_resched();
3707                 eb = path->nodes[0];
3708                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3709                         ret = btrfs_next_leaf(rc->extent_root, path);
3710                         if (ret < 0) {
3711                                 err = ret;
3712                                 break;
3713                         }
3714                         if (ret > 0)
3715                                 break;
3716                         eb = path->nodes[0];
3717                 }
3718
3719                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3720                 if (key.objectid != extent_key->objectid)
3721                         break;
3722
3723                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3724                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3725                         ret = __add_tree_block(rc, key.offset, blocksize,
3726                                                blocks);
3727                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3728                         dref = btrfs_item_ptr(eb, path->slots[0],
3729                                               struct btrfs_extent_data_ref);
3730                         ret = find_data_references(rc, extent_key,
3731                                                    eb, dref, blocks);
3732                 } else {
3733                         ret = 0;
3734                 }
3735                 if (ret) {
3736                         err = ret;
3737                         break;
3738                 }
3739                 path->slots[0]++;
3740         }
3741 out:
3742         btrfs_release_path(path);
3743         if (err)
3744                 free_block_list(blocks);
3745         return err;
3746 }
3747
3748 /*
3749  * helper to find next unprocessed extent
3750  */
3751 static noinline_for_stack
3752 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3753                      struct btrfs_key *extent_key)
3754 {
3755         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3756         struct btrfs_key key;
3757         struct extent_buffer *leaf;
3758         u64 start, end, last;
3759         int ret;
3760
3761         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3762         while (1) {
3763                 cond_resched();
3764                 if (rc->search_start >= last) {
3765                         ret = 1;
3766                         break;
3767                 }
3768
3769                 key.objectid = rc->search_start;
3770                 key.type = BTRFS_EXTENT_ITEM_KEY;
3771                 key.offset = 0;
3772
3773                 path->search_commit_root = 1;
3774                 path->skip_locking = 1;
3775                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3776                                         0, 0);
3777                 if (ret < 0)
3778                         break;
3779 next:
3780                 leaf = path->nodes[0];
3781                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3782                         ret = btrfs_next_leaf(rc->extent_root, path);
3783                         if (ret != 0)
3784                                 break;
3785                         leaf = path->nodes[0];
3786                 }
3787
3788                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3789                 if (key.objectid >= last) {
3790                         ret = 1;
3791                         break;
3792                 }
3793
3794                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3795                     key.type != BTRFS_METADATA_ITEM_KEY) {
3796                         path->slots[0]++;
3797                         goto next;
3798                 }
3799
3800                 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3801                     key.objectid + key.offset <= rc->search_start) {
3802                         path->slots[0]++;
3803                         goto next;
3804                 }
3805
3806                 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3807                     key.objectid + fs_info->nodesize <=
3808                     rc->search_start) {
3809                         path->slots[0]++;
3810                         goto next;
3811                 }
3812
3813                 ret = find_first_extent_bit(&rc->processed_blocks,
3814                                             key.objectid, &start, &end,
3815                                             EXTENT_DIRTY, NULL);
3816
3817                 if (ret == 0 && start <= key.objectid) {
3818                         btrfs_release_path(path);
3819                         rc->search_start = end + 1;
3820                 } else {
3821                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
3822                                 rc->search_start = key.objectid + key.offset;
3823                         else
3824                                 rc->search_start = key.objectid +
3825                                         fs_info->nodesize;
3826                         memcpy(extent_key, &key, sizeof(key));
3827                         return 0;
3828                 }
3829         }
3830         btrfs_release_path(path);
3831         return ret;
3832 }
3833
3834 static void set_reloc_control(struct reloc_control *rc)
3835 {
3836         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3837
3838         mutex_lock(&fs_info->reloc_mutex);
3839         fs_info->reloc_ctl = rc;
3840         mutex_unlock(&fs_info->reloc_mutex);
3841 }
3842
3843 static void unset_reloc_control(struct reloc_control *rc)
3844 {
3845         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3846
3847         mutex_lock(&fs_info->reloc_mutex);
3848         fs_info->reloc_ctl = NULL;
3849         mutex_unlock(&fs_info->reloc_mutex);
3850 }
3851
3852 static int check_extent_flags(u64 flags)
3853 {
3854         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3855             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3856                 return 1;
3857         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3858             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3859                 return 1;
3860         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3861             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3862                 return 1;
3863         return 0;
3864 }
3865
3866 static noinline_for_stack
3867 int prepare_to_relocate(struct reloc_control *rc)
3868 {
3869         struct btrfs_trans_handle *trans;
3870         int ret;
3871
3872         rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3873                                               BTRFS_BLOCK_RSV_TEMP);
3874         if (!rc->block_rsv)
3875                 return -ENOMEM;
3876
3877         memset(&rc->cluster, 0, sizeof(rc->cluster));
3878         rc->search_start = rc->block_group->key.objectid;
3879         rc->extents_found = 0;
3880         rc->nodes_relocated = 0;
3881         rc->merging_rsv_size = 0;
3882         rc->reserved_bytes = 0;
3883         rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3884                               RELOCATION_RESERVED_NODES;
3885         ret = btrfs_block_rsv_refill(rc->extent_root,
3886                                      rc->block_rsv, rc->block_rsv->size,
3887                                      BTRFS_RESERVE_FLUSH_ALL);
3888         if (ret)
3889                 return ret;
3890
3891         rc->create_reloc_tree = 1;
3892         set_reloc_control(rc);
3893
3894         trans = btrfs_join_transaction(rc->extent_root);
3895         if (IS_ERR(trans)) {
3896                 unset_reloc_control(rc);
3897                 /*
3898                  * extent tree is not a ref_cow tree and has no reloc_root to
3899                  * cleanup.  And callers are responsible to free the above
3900                  * block rsv.
3901                  */
3902                 return PTR_ERR(trans);
3903         }
3904         btrfs_commit_transaction(trans);
3905         return 0;
3906 }
3907
3908 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3909 {
3910         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3911         struct rb_root blocks = RB_ROOT;
3912         struct btrfs_key key;
3913         struct btrfs_trans_handle *trans = NULL;
3914         struct btrfs_path *path;
3915         struct btrfs_extent_item *ei;
3916         u64 flags;
3917         u32 item_size;
3918         int ret;
3919         int err = 0;
3920         int progress = 0;
3921
3922         path = btrfs_alloc_path();
3923         if (!path)
3924                 return -ENOMEM;
3925         path->reada = READA_FORWARD;
3926
3927         ret = prepare_to_relocate(rc);
3928         if (ret) {
3929                 err = ret;
3930                 goto out_free;
3931         }
3932
3933         while (1) {
3934                 rc->reserved_bytes = 0;
3935                 ret = btrfs_block_rsv_refill(rc->extent_root,
3936                                         rc->block_rsv, rc->block_rsv->size,
3937                                         BTRFS_RESERVE_FLUSH_ALL);
3938                 if (ret) {
3939                         err = ret;
3940                         break;
3941                 }
3942                 progress++;
3943                 trans = btrfs_start_transaction(rc->extent_root, 0);
3944                 if (IS_ERR(trans)) {
3945                         err = PTR_ERR(trans);
3946                         trans = NULL;
3947                         break;
3948                 }
3949 restart:
3950                 if (update_backref_cache(trans, &rc->backref_cache)) {
3951                         btrfs_end_transaction(trans);
3952                         continue;
3953                 }
3954
3955                 ret = find_next_extent(rc, path, &key);
3956                 if (ret < 0)
3957                         err = ret;
3958                 if (ret != 0)
3959                         break;
3960
3961                 rc->extents_found++;
3962
3963                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3964                                     struct btrfs_extent_item);
3965                 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3966                 if (item_size >= sizeof(*ei)) {
3967                         flags = btrfs_extent_flags(path->nodes[0], ei);
3968                         ret = check_extent_flags(flags);
3969                         BUG_ON(ret);
3970
3971                 } else {
3972                         BUG();
3973                 }
3974
3975                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3976                         ret = add_tree_block(rc, &key, path, &blocks);
3977                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3978                            (flags & BTRFS_EXTENT_FLAG_DATA)) {
3979                         ret = add_data_references(rc, &key, path, &blocks);
3980                 } else {
3981                         btrfs_release_path(path);
3982                         ret = 0;
3983                 }
3984                 if (ret < 0) {
3985                         err = ret;
3986                         break;
3987                 }
3988
3989                 if (!RB_EMPTY_ROOT(&blocks)) {
3990                         ret = relocate_tree_blocks(trans, rc, &blocks);
3991                         if (ret < 0) {
3992                                 /*
3993                                  * if we fail to relocate tree blocks, force to update
3994                                  * backref cache when committing transaction.
3995                                  */
3996                                 rc->backref_cache.last_trans = trans->transid - 1;
3997
3998                                 if (ret != -EAGAIN) {
3999                                         err = ret;
4000                                         break;
4001                                 }
4002                                 rc->extents_found--;
4003                                 rc->search_start = key.objectid;
4004                         }
4005                 }
4006
4007                 btrfs_end_transaction_throttle(trans);
4008                 btrfs_btree_balance_dirty(fs_info);
4009                 trans = NULL;
4010
4011                 if (rc->stage == MOVE_DATA_EXTENTS &&
4012                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
4013                         rc->found_file_extent = 1;
4014                         ret = relocate_data_extent(rc->data_inode,
4015                                                    &key, &rc->cluster);
4016                         if (ret < 0) {
4017                                 err = ret;
4018                                 break;
4019                         }
4020                 }
4021         }
4022         if (trans && progress && err == -ENOSPC) {
4023                 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
4024                 if (ret == 1) {
4025                         err = 0;
4026                         progress = 0;
4027                         goto restart;
4028                 }
4029         }
4030
4031         btrfs_release_path(path);
4032         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4033
4034         if (trans) {
4035                 btrfs_end_transaction_throttle(trans);
4036                 btrfs_btree_balance_dirty(fs_info);
4037         }
4038
4039         if (!err) {
4040                 ret = relocate_file_extent_cluster(rc->data_inode,
4041                                                    &rc->cluster);
4042                 if (ret < 0)
4043                         err = ret;
4044         }
4045
4046         rc->create_reloc_tree = 0;
4047         set_reloc_control(rc);
4048
4049         backref_cache_cleanup(&rc->backref_cache);
4050         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4051
4052         err = prepare_to_merge(rc, err);
4053
4054         merge_reloc_roots(rc);
4055
4056         rc->merge_reloc_tree = 0;
4057         unset_reloc_control(rc);
4058         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4059
4060         /* get rid of pinned extents */
4061         trans = btrfs_join_transaction(rc->extent_root);
4062         if (IS_ERR(trans)) {
4063                 err = PTR_ERR(trans);
4064                 goto out_free;
4065         }
4066         btrfs_commit_transaction(trans);
4067 out_free:
4068         btrfs_free_block_rsv(fs_info, rc->block_rsv);
4069         btrfs_free_path(path);
4070         return err;
4071 }
4072
4073 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4074                                  struct btrfs_root *root, u64 objectid)
4075 {
4076         struct btrfs_path *path;
4077         struct btrfs_inode_item *item;
4078         struct extent_buffer *leaf;
4079         int ret;
4080
4081         path = btrfs_alloc_path();
4082         if (!path)
4083                 return -ENOMEM;
4084
4085         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4086         if (ret)
4087                 goto out;
4088
4089         leaf = path->nodes[0];
4090         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4091         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4092         btrfs_set_inode_generation(leaf, item, 1);
4093         btrfs_set_inode_size(leaf, item, 0);
4094         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4095         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4096                                           BTRFS_INODE_PREALLOC);
4097         btrfs_mark_buffer_dirty(leaf);
4098 out:
4099         btrfs_free_path(path);
4100         return ret;
4101 }
4102
4103 /*
4104  * helper to create inode for data relocation.
4105  * the inode is in data relocation tree and its link count is 0
4106  */
4107 static noinline_for_stack
4108 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4109                                  struct btrfs_block_group_cache *group)
4110 {
4111         struct inode *inode = NULL;
4112         struct btrfs_trans_handle *trans;
4113         struct btrfs_root *root;
4114         struct btrfs_key key;
4115         u64 objectid;
4116         int err = 0;
4117
4118         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4119         if (IS_ERR(root))
4120                 return ERR_CAST(root);
4121
4122         trans = btrfs_start_transaction(root, 6);
4123         if (IS_ERR(trans))
4124                 return ERR_CAST(trans);
4125
4126         err = btrfs_find_free_objectid(root, &objectid);
4127         if (err)
4128                 goto out;
4129
4130         err = __insert_orphan_inode(trans, root, objectid);
4131         BUG_ON(err);
4132
4133         key.objectid = objectid;
4134         key.type = BTRFS_INODE_ITEM_KEY;
4135         key.offset = 0;
4136         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4137         BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4138         BTRFS_I(inode)->index_cnt = group->key.objectid;
4139
4140         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4141 out:
4142         btrfs_end_transaction(trans);
4143         btrfs_btree_balance_dirty(fs_info);
4144         if (err) {
4145                 if (inode)
4146                         iput(inode);
4147                 inode = ERR_PTR(err);
4148         }
4149         return inode;
4150 }
4151
4152 static struct reloc_control *alloc_reloc_control(void)
4153 {
4154         struct reloc_control *rc;
4155
4156         rc = kzalloc(sizeof(*rc), GFP_NOFS);
4157         if (!rc)
4158                 return NULL;
4159
4160         INIT_LIST_HEAD(&rc->reloc_roots);
4161         backref_cache_init(&rc->backref_cache);
4162         mapping_tree_init(&rc->reloc_root_tree);
4163         extent_io_tree_init(&rc->processed_blocks, NULL);
4164         return rc;
4165 }
4166
4167 /*
4168  * Print the block group being relocated
4169  */
4170 static void describe_relocation(struct btrfs_fs_info *fs_info,
4171                                 struct btrfs_block_group_cache *block_group)
4172 {
4173         char buf[128];          /* prefixed by a '|' that'll be dropped */
4174         u64 flags = block_group->flags;
4175
4176         /* Shouldn't happen */
4177         if (!flags) {
4178                 strcpy(buf, "|NONE");
4179         } else {
4180                 char *bp = buf;
4181
4182 #define DESCRIBE_FLAG(f, d) \
4183                 if (flags & BTRFS_BLOCK_GROUP_##f) { \
4184                         bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
4185                         flags &= ~BTRFS_BLOCK_GROUP_##f; \
4186                 }
4187                 DESCRIBE_FLAG(DATA,     "data");
4188                 DESCRIBE_FLAG(SYSTEM,   "system");
4189                 DESCRIBE_FLAG(METADATA, "metadata");
4190                 DESCRIBE_FLAG(RAID0,    "raid0");
4191                 DESCRIBE_FLAG(RAID1,    "raid1");
4192                 DESCRIBE_FLAG(DUP,      "dup");
4193                 DESCRIBE_FLAG(RAID10,   "raid10");
4194                 DESCRIBE_FLAG(RAID5,    "raid5");
4195                 DESCRIBE_FLAG(RAID6,    "raid6");
4196                 if (flags)
4197                         snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
4198 #undef DESCRIBE_FLAG
4199         }
4200
4201         btrfs_info(fs_info,
4202                    "relocating block group %llu flags %s",
4203                    block_group->key.objectid, buf + 1);
4204 }
4205
4206 /*
4207  * function to relocate all extents in a block group.
4208  */
4209 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4210 {
4211         struct btrfs_root *extent_root = fs_info->extent_root;
4212         struct reloc_control *rc;
4213         struct inode *inode;
4214         struct btrfs_path *path;
4215         int ret;
4216         int rw = 0;
4217         int err = 0;
4218
4219         rc = alloc_reloc_control();
4220         if (!rc)
4221                 return -ENOMEM;
4222
4223         rc->extent_root = extent_root;
4224
4225         rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4226         BUG_ON(!rc->block_group);
4227
4228         ret = btrfs_inc_block_group_ro(rc->block_group);
4229         if (ret) {
4230                 err = ret;
4231                 goto out;
4232         }
4233         rw = 1;
4234
4235         path = btrfs_alloc_path();
4236         if (!path) {
4237                 err = -ENOMEM;
4238                 goto out;
4239         }
4240
4241         inode = lookup_free_space_inode(fs_info, rc->block_group, path);
4242         btrfs_free_path(path);
4243
4244         if (!IS_ERR(inode))
4245                 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4246         else
4247                 ret = PTR_ERR(inode);
4248
4249         if (ret && ret != -ENOENT) {
4250                 err = ret;
4251                 goto out;
4252         }
4253
4254         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4255         if (IS_ERR(rc->data_inode)) {
4256                 err = PTR_ERR(rc->data_inode);
4257                 rc->data_inode = NULL;
4258                 goto out;
4259         }
4260
4261         describe_relocation(fs_info, rc->block_group);
4262
4263         btrfs_wait_block_group_reservations(rc->block_group);
4264         btrfs_wait_nocow_writers(rc->block_group);
4265         btrfs_wait_ordered_roots(fs_info, U64_MAX,
4266                                  rc->block_group->key.objectid,
4267                                  rc->block_group->key.offset);
4268
4269         while (1) {
4270                 mutex_lock(&fs_info->cleaner_mutex);
4271                 ret = relocate_block_group(rc);
4272                 mutex_unlock(&fs_info->cleaner_mutex);
4273                 if (ret < 0) {
4274                         err = ret;
4275                         goto out;
4276                 }
4277
4278                 if (rc->extents_found == 0)
4279                         break;
4280
4281                 btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4282
4283                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4284                         ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4285                                                        (u64)-1);
4286                         if (ret) {
4287                                 err = ret;
4288                                 goto out;
4289                         }
4290                         invalidate_mapping_pages(rc->data_inode->i_mapping,
4291                                                  0, -1);
4292                         rc->stage = UPDATE_DATA_PTRS;
4293                 }
4294         }
4295
4296         WARN_ON(rc->block_group->pinned > 0);
4297         WARN_ON(rc->block_group->reserved > 0);
4298         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4299 out:
4300         if (err && rw)
4301                 btrfs_dec_block_group_ro(rc->block_group);
4302         iput(rc->data_inode);
4303         btrfs_put_block_group(rc->block_group);
4304         kfree(rc);
4305         return err;
4306 }
4307
4308 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4309 {
4310         struct btrfs_fs_info *fs_info = root->fs_info;
4311         struct btrfs_trans_handle *trans;
4312         int ret, err;
4313
4314         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4315         if (IS_ERR(trans))
4316                 return PTR_ERR(trans);
4317
4318         memset(&root->root_item.drop_progress, 0,
4319                 sizeof(root->root_item.drop_progress));
4320         root->root_item.drop_level = 0;
4321         btrfs_set_root_refs(&root->root_item, 0);
4322         ret = btrfs_update_root(trans, fs_info->tree_root,
4323                                 &root->root_key, &root->root_item);
4324
4325         err = btrfs_end_transaction(trans);
4326         if (err)
4327                 return err;
4328         return ret;
4329 }
4330
4331 /*
4332  * recover relocation interrupted by system crash.
4333  *
4334  * this function resumes merging reloc trees with corresponding fs trees.
4335  * this is important for keeping the sharing of tree blocks
4336  */
4337 int btrfs_recover_relocation(struct btrfs_root *root)
4338 {
4339         struct btrfs_fs_info *fs_info = root->fs_info;
4340         LIST_HEAD(reloc_roots);
4341         struct btrfs_key key;
4342         struct btrfs_root *fs_root;
4343         struct btrfs_root *reloc_root;
4344         struct btrfs_path *path;
4345         struct extent_buffer *leaf;
4346         struct reloc_control *rc = NULL;
4347         struct btrfs_trans_handle *trans;
4348         int ret;
4349         int err = 0;
4350
4351         path = btrfs_alloc_path();
4352         if (!path)
4353                 return -ENOMEM;
4354         path->reada = READA_BACK;
4355
4356         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4357         key.type = BTRFS_ROOT_ITEM_KEY;
4358         key.offset = (u64)-1;
4359
4360         while (1) {
4361                 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4362                                         path, 0, 0);
4363                 if (ret < 0) {
4364                         err = ret;
4365                         goto out;
4366                 }
4367                 if (ret > 0) {
4368                         if (path->slots[0] == 0)
4369                                 break;
4370                         path->slots[0]--;
4371                 }
4372                 leaf = path->nodes[0];
4373                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4374                 btrfs_release_path(path);
4375
4376                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4377                     key.type != BTRFS_ROOT_ITEM_KEY)
4378                         break;
4379
4380                 reloc_root = btrfs_read_fs_root(root, &key);
4381                 if (IS_ERR(reloc_root)) {
4382                         err = PTR_ERR(reloc_root);
4383                         goto out;
4384                 }
4385
4386                 list_add(&reloc_root->root_list, &reloc_roots);
4387
4388                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4389                         fs_root = read_fs_root(fs_info,
4390                                                reloc_root->root_key.offset);
4391                         if (IS_ERR(fs_root)) {
4392                                 ret = PTR_ERR(fs_root);
4393                                 if (ret != -ENOENT) {
4394                                         err = ret;
4395                                         goto out;
4396                                 }
4397                                 ret = mark_garbage_root(reloc_root);
4398                                 if (ret < 0) {
4399                                         err = ret;
4400                                         goto out;
4401                                 }
4402                         }
4403                 }
4404
4405                 if (key.offset == 0)
4406                         break;
4407
4408                 key.offset--;
4409         }
4410         btrfs_release_path(path);
4411
4412         if (list_empty(&reloc_roots))
4413                 goto out;
4414
4415         rc = alloc_reloc_control();
4416         if (!rc) {
4417                 err = -ENOMEM;
4418                 goto out;
4419         }
4420
4421         rc->extent_root = fs_info->extent_root;
4422
4423         set_reloc_control(rc);
4424
4425         trans = btrfs_join_transaction(rc->extent_root);
4426         if (IS_ERR(trans)) {
4427                 unset_reloc_control(rc);
4428                 err = PTR_ERR(trans);
4429                 goto out_free;
4430         }
4431
4432         rc->merge_reloc_tree = 1;
4433
4434         while (!list_empty(&reloc_roots)) {
4435                 reloc_root = list_entry(reloc_roots.next,
4436                                         struct btrfs_root, root_list);
4437                 list_del(&reloc_root->root_list);
4438
4439                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4440                         list_add_tail(&reloc_root->root_list,
4441                                       &rc->reloc_roots);
4442                         continue;
4443                 }
4444
4445                 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4446                 if (IS_ERR(fs_root)) {
4447                         err = PTR_ERR(fs_root);
4448                         goto out_free;
4449                 }
4450
4451                 err = __add_reloc_root(reloc_root);
4452                 BUG_ON(err < 0); /* -ENOMEM or logic error */
4453                 fs_root->reloc_root = reloc_root;
4454         }
4455
4456         err = btrfs_commit_transaction(trans);
4457         if (err)
4458                 goto out_free;
4459
4460         merge_reloc_roots(rc);
4461
4462         unset_reloc_control(rc);
4463
4464         trans = btrfs_join_transaction(rc->extent_root);
4465         if (IS_ERR(trans)) {
4466                 err = PTR_ERR(trans);
4467                 goto out_free;
4468         }
4469         err = btrfs_commit_transaction(trans);
4470 out_free:
4471         kfree(rc);
4472 out:
4473         if (!list_empty(&reloc_roots))
4474                 free_reloc_roots(&reloc_roots);
4475
4476         btrfs_free_path(path);
4477
4478         if (err == 0) {
4479                 /* cleanup orphan inode in data relocation tree */
4480                 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4481                 if (IS_ERR(fs_root))
4482                         err = PTR_ERR(fs_root);
4483                 else
4484                         err = btrfs_orphan_cleanup(fs_root);
4485         }
4486         return err;
4487 }
4488
4489 /*
4490  * helper to add ordered checksum for data relocation.
4491  *
4492  * cloning checksum properly handles the nodatasum extents.
4493  * it also saves CPU time to re-calculate the checksum.
4494  */
4495 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4496 {
4497         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4498         struct btrfs_ordered_sum *sums;
4499         struct btrfs_ordered_extent *ordered;
4500         int ret;
4501         u64 disk_bytenr;
4502         u64 new_bytenr;
4503         LIST_HEAD(list);
4504
4505         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4506         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4507
4508         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4509         ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4510                                        disk_bytenr + len - 1, &list, 0);
4511         if (ret)
4512                 goto out;
4513
4514         while (!list_empty(&list)) {
4515                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4516                 list_del_init(&sums->list);
4517
4518                 /*
4519                  * We need to offset the new_bytenr based on where the csum is.
4520                  * We need to do this because we will read in entire prealloc
4521                  * extents but we may have written to say the middle of the
4522                  * prealloc extent, so we need to make sure the csum goes with
4523                  * the right disk offset.
4524                  *
4525                  * We can do this because the data reloc inode refers strictly
4526                  * to the on disk bytes, so we don't have to worry about
4527                  * disk_len vs real len like with real inodes since it's all
4528                  * disk length.
4529                  */
4530                 new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4531                 sums->bytenr = new_bytenr;
4532
4533                 btrfs_add_ordered_sum(inode, ordered, sums);
4534         }
4535 out:
4536         btrfs_put_ordered_extent(ordered);
4537         return ret;
4538 }
4539
4540 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4541                           struct btrfs_root *root, struct extent_buffer *buf,
4542                           struct extent_buffer *cow)
4543 {
4544         struct btrfs_fs_info *fs_info = root->fs_info;
4545         struct reloc_control *rc;
4546         struct backref_node *node;
4547         int first_cow = 0;
4548         int level;
4549         int ret = 0;
4550
4551         rc = fs_info->reloc_ctl;
4552         if (!rc)
4553                 return 0;
4554
4555         BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4556                root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4557
4558         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4559                 if (buf == root->node)
4560                         __update_reloc_root(root, cow->start);
4561         }
4562
4563         level = btrfs_header_level(buf);
4564         if (btrfs_header_generation(buf) <=
4565             btrfs_root_last_snapshot(&root->root_item))
4566                 first_cow = 1;
4567
4568         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4569             rc->create_reloc_tree) {
4570                 WARN_ON(!first_cow && level == 0);
4571
4572                 node = rc->backref_cache.path[level];
4573                 BUG_ON(node->bytenr != buf->start &&
4574                        node->new_bytenr != buf->start);
4575
4576                 drop_node_buffer(node);
4577                 extent_buffer_get(cow);
4578                 node->eb = cow;
4579                 node->new_bytenr = cow->start;
4580
4581                 if (!node->pending) {
4582                         list_move_tail(&node->list,
4583                                        &rc->backref_cache.pending[level]);
4584                         node->pending = 1;
4585                 }
4586
4587                 if (first_cow)
4588                         __mark_block_processed(rc, node);
4589
4590                 if (first_cow && level > 0)
4591                         rc->nodes_relocated += buf->len;
4592         }
4593
4594         if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4595                 ret = replace_file_extents(trans, rc, root, cow);
4596         return ret;
4597 }
4598
4599 /*
4600  * called before creating snapshot. it calculates metadata reservation
4601  * required for relocating tree blocks in the snapshot
4602  */
4603 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4604                               u64 *bytes_to_reserve)
4605 {
4606         struct btrfs_root *root;
4607         struct reloc_control *rc;
4608
4609         root = pending->root;
4610         if (!root->reloc_root)
4611                 return;
4612
4613         rc = root->fs_info->reloc_ctl;
4614         if (!rc->merge_reloc_tree)
4615                 return;
4616
4617         root = root->reloc_root;
4618         BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4619         /*
4620          * relocation is in the stage of merging trees. the space
4621          * used by merging a reloc tree is twice the size of
4622          * relocated tree nodes in the worst case. half for cowing
4623          * the reloc tree, half for cowing the fs tree. the space
4624          * used by cowing the reloc tree will be freed after the
4625          * tree is dropped. if we create snapshot, cowing the fs
4626          * tree may use more space than it frees. so we need
4627          * reserve extra space.
4628          */
4629         *bytes_to_reserve += rc->nodes_relocated;
4630 }
4631
4632 /*
4633  * called after snapshot is created. migrate block reservation
4634  * and create reloc root for the newly created snapshot
4635  */
4636 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4637                                struct btrfs_pending_snapshot *pending)
4638 {
4639         struct btrfs_root *root = pending->root;
4640         struct btrfs_root *reloc_root;
4641         struct btrfs_root *new_root;
4642         struct reloc_control *rc;
4643         int ret;
4644
4645         if (!root->reloc_root)
4646                 return 0;
4647
4648         rc = root->fs_info->reloc_ctl;
4649         rc->merging_rsv_size += rc->nodes_relocated;
4650
4651         if (rc->merge_reloc_tree) {
4652                 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4653                                               rc->block_rsv,
4654                                               rc->nodes_relocated, 1);
4655                 if (ret)
4656                         return ret;
4657         }
4658
4659         new_root = pending->snap;
4660         reloc_root = create_reloc_root(trans, root->reloc_root,
4661                                        new_root->root_key.objectid);
4662         if (IS_ERR(reloc_root))
4663                 return PTR_ERR(reloc_root);
4664
4665         ret = __add_reloc_root(reloc_root);
4666         BUG_ON(ret < 0);
4667         new_root->reloc_root = reloc_root;
4668
4669         if (rc->create_reloc_tree)
4670                 ret = clone_backref_node(trans, rc, root, reloc_root);
4671         return ret;
4672 }