Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64...
[sfrench/cifs-2.6.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15
16 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
17                       *root, struct btrfs_path *path, int level);
18 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
19                       const struct btrfs_key *ins_key, struct btrfs_path *path,
20                       int data_size, int extend);
21 static int push_node_left(struct btrfs_trans_handle *trans,
22                           struct btrfs_fs_info *fs_info,
23                           struct extent_buffer *dst,
24                           struct extent_buffer *src, int empty);
25 static int balance_node_right(struct btrfs_trans_handle *trans,
26                               struct btrfs_fs_info *fs_info,
27                               struct extent_buffer *dst_buf,
28                               struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30                     int level, int slot);
31
32 struct btrfs_path *btrfs_alloc_path(void)
33 {
34         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
35 }
36
37 /*
38  * set all locked nodes in the path to blocking locks.  This should
39  * be done before scheduling
40  */
41 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
42 {
43         int i;
44         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
45                 if (!p->nodes[i] || !p->locks[i])
46                         continue;
47                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
48                 if (p->locks[i] == BTRFS_READ_LOCK)
49                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
50                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
51                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
52         }
53 }
54
55 /* this also releases the path */
56 void btrfs_free_path(struct btrfs_path *p)
57 {
58         if (!p)
59                 return;
60         btrfs_release_path(p);
61         kmem_cache_free(btrfs_path_cachep, p);
62 }
63
64 /*
65  * path release drops references on the extent buffers in the path
66  * and it drops any locks held by this path
67  *
68  * It is safe to call this on paths that no locks or extent buffers held.
69  */
70 noinline void btrfs_release_path(struct btrfs_path *p)
71 {
72         int i;
73
74         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75                 p->slots[i] = 0;
76                 if (!p->nodes[i])
77                         continue;
78                 if (p->locks[i]) {
79                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
80                         p->locks[i] = 0;
81                 }
82                 free_extent_buffer(p->nodes[i]);
83                 p->nodes[i] = NULL;
84         }
85 }
86
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* wait out the pending free of the old root, then retry */
		synchronize_rcu();
	}
	return eb;
}
120
121 /* loop around taking references on and locking the root node of the
122  * tree until you end up with a lock on the root.  A locked buffer
123  * is returned, with a reference held.
124  */
125 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
126 {
127         struct extent_buffer *eb;
128
129         while (1) {
130                 eb = btrfs_root_node(root);
131                 btrfs_tree_lock(eb);
132                 if (eb == root->node)
133                         break;
134                 btrfs_tree_unlock(eb);
135                 free_extent_buffer(eb);
136         }
137         return eb;
138 }
139
140 /* loop around taking references on and locking the root node of the
141  * tree until you end up with a lock on the root.  A locked buffer
142  * is returned, with a reference held.
143  */
144 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
145 {
146         struct extent_buffer *eb;
147
148         while (1) {
149                 eb = btrfs_root_node(root);
150                 btrfs_tree_read_lock(eb);
151                 if (eb == root->node)
152                         break;
153                 btrfs_tree_read_unlock(eb);
154                 free_extent_buffer(eb);
155         }
156         return eb;
157 }
158
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/*
	 * Unlocked fast path: nothing to do if the root is already marked
	 * dirty or dirty tracking is not enabled for it.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* test_and_set_bit makes a concurrent caller lose the race cleanly */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
183
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * @trans:		transaction handle this copy happens in
 * @root:		tree that @buf is the root node of
 * @buf:		root node to copy
 * @cow_ret:		on success, set to the new copy (dirty, referenced)
 * @new_root_objectid:	objectid of the tree the copy is made for
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted trees may only be COWed within their transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* grab the first key of @buf to hand to the block allocator */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the whole block, then fix up the header for the new copy */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* add references for everything the copy points to */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
242
/*
 * Operation types recorded in the tree modification log (see
 * struct tree_mod_elem below for the per-op payload fields).
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
252
/* describes the old root in a MOD_LOG_ROOT_REPLACE record */
struct tree_mod_root {
	u64 logical;	/* logical address of the old root block */
	u8 level;	/* level of the old root node */
};
257
/*
 * One record in the tree modification log rb-tree.  Which payload fields
 * are meaningful depends on @op, as noted on each field below.
 */
struct tree_mod_elem {
	struct rb_node node;	/* linkage in fs_info->tree_mod_log */
	u64 logical;		/* logical address the record is keyed by */
	u64 seq;		/* sequence number, assigned on insertion */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
283
284 /*
285  * Pull a new tree mod seq number for our operation.
286  */
287 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
288 {
289         return atomic64_inc_return(&fs_info->tree_mod_seq);
290 }
291
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns elem->seq: a freshly pulled sequence number if it was zero on
 * entry, otherwise the already-set value.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/*
	 * NOTE(review): tree_mod_log_lock is taken in addition to
	 * tree_mod_seq_lock, presumably to fence against concurrent log
	 * insertions — confirm against the loggers' locking.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
314
/*
 * Drop the blocker @elem.  If a blocker with a lower sequence number
 * remains, nothing can be pruned.  Otherwise every log element whose
 * sequence is at or below the lowest remaining blocker's sequence is
 * removed from the log and freed (with no blockers left, the whole log
 * is emptied).
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem->seq == 0 means it was never registered as a blocker */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the lowest sequence among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor before erasing the current node */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
364
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	/* stamp the element with a fresh sequence number */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * Note the reversed orientation: larger (logical, seq) pairs
		 * descend to the left.  __tree_mod_log_search walks the tree
		 * with the same convention.
		 */
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
406
407 /*
408  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
409  * returns zero with the tree_mod_log_lock acquired. The caller must hold
410  * this until all tree mod log insertions are recorded in the rb tree and then
411  * write unlock fs_info::tree_mod_log_lock.
412  */
413 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
414                                     struct extent_buffer *eb) {
415         smp_mb();
416         if (list_empty(&(fs_info)->tree_mod_seq_list))
417                 return 1;
418         if (eb && btrfs_header_level(eb) == 0)
419                 return 1;
420
421         write_lock(&fs_info->tree_mod_log_lock);
422         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
423                 write_unlock(&fs_info->tree_mod_log_lock);
424                 return 1;
425         }
426
427         return 0;
428 }
429
430 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
431 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
432                                     struct extent_buffer *eb)
433 {
434         smp_mb();
435         if (list_empty(&(fs_info)->tree_mod_seq_list))
436                 return 0;
437         if (eb && btrfs_header_level(eb) == 0)
438                 return 0;
439
440         return 1;
441 }
442
443 static struct tree_mod_elem *
444 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
445                     enum mod_log_op op, gfp_t flags)
446 {
447         struct tree_mod_elem *tm;
448
449         tm = kzalloc(sizeof(*tm), flags);
450         if (!tm)
451                 return NULL;
452
453         tm->logical = eb->start;
454         if (op != MOD_LOG_KEY_ADD) {
455                 btrfs_node_key(eb, &tm->key, slot);
456                 tm->blockptr = btrfs_node_blockptr(eb, slot);
457         }
458         tm->op = op;
459         tm->slot = slot;
460         tm->generation = btrfs_node_ptr_generation(eb, slot);
461         RB_CLEAR_NODE(&tm->node);
462
463         return tm;
464 }
465
/*
 * Record a single key operation @op on slot @slot of @eb in the tree mod
 * log.  Returns 0 when logging is unnecessary or succeeded, -ENOMEM on
 * allocation failure, or the error from the rb-tree insertion.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* allocate before taking tree_mod_log_lock in tree_mod_dont_log */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* returns 1 (no lock held) if logging became moot, else 0 locked */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
491
/*
 * Log a move of @nr_items key pointers from @src_slot to @dst_slot inside
 * @eb.  When the move overwrites slots (dst_slot < src_slot), the
 * overwritten slots are logged as MOD_LOG_KEY_REMOVE_WHILE_MOVING before
 * the move record itself.  Returns 0 when logging is unnecessary or
 * succeeded, negative errno on failure.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* allocate every element up front, before taking the log lock */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* returns 1 (no lock held) if logging became moot, else 0 locked */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* unlink whatever was already inserted, then free everything */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
564
/*
 * Insert the @nritems pre-allocated elements in @tm_list into the log,
 * from the highest slot down.  On failure the elements already inserted
 * are unlinked again and the error returned; the elements themselves are
 * not freed here.  Caller must hold tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* roll back: unlink everything inserted so far */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
585
/*
 * Log the replacement of tree root @old_root by @new_root.  When
 * @log_removal is set and the old root is not a leaf, each of its key
 * pointers is additionally logged as MOD_LOG_KEY_REMOVE_WHILE_FREEING.
 * Returns 0 when logging is unnecessary or succeeded, negative errno on
 * failure.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* allocate all elements up front, before taking the log lock */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* the record is keyed by the new root; old root goes in the payload */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* returns 1 (no lock held) if logging became moot, else 0 locked */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
654
/*
 * Find a log element for block @start with seq >= @min_seq.  With
 * @smallest set the match with the lowest such seq is returned,
 * otherwise the one with the highest.  Returns NULL if none exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		/* same reversed orientation as __tree_mod_log_insert */
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* cur->seq == min_seq with smallest set: exact hit */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
696
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item) for the block at @start.
 * any element with a time sequence lower than min_seq will be ignored.
 * Returns NULL when there is no such element.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
708
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item) for the block at
 * @start.  any element with a time sequence lower than min_seq will be
 * ignored.  Returns NULL when there is no such element.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
719
/*
 * Log the copy of @nr_items key pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset): one KEY_REMOVE
 * element per source slot and one KEY_ADD per destination slot.
 * Returns 0 when logging is unnecessary or succeeded, negative errno
 * on failure.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* copies between two leaves are not logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array: first half for ADD elements, second half for REMOVE */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* returns 1 (no lock held) if logging became moot, else 0 locked */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* unlink whatever was already inserted, then free everything */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
790
/*
 * Log the freeing of node @eb by recording one
 * MOD_LOG_KEY_REMOVE_WHILE_FREEING element per key pointer.  Leaves are
 * not logged.  Returns 0 when logging is unnecessary or succeeded,
 * negative errno on failure.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	/* allocate all elements up front, before taking the log lock */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* returns 1 (no lock held) if logging became moot, else 0 locked */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
836
837 /*
838  * check if the tree block can be shared by multiple trees
839  */
840 int btrfs_block_can_be_shared(struct btrfs_root *root,
841                               struct extent_buffer *buf)
842 {
843         /*
844          * Tree blocks not in reference counted trees and tree roots
845          * are never shared. If a block was allocated after the last
846          * snapshot and the block was not allocated by tree relocation,
847          * we know the block is not shared.
848          */
849         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
850             buf != root->node && buf != root->commit_root &&
851             (btrfs_header_generation(buf) <=
852              btrfs_root_last_snapshot(&root->root_item) ||
853              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
854                 return 1;
855
856         return 0;
857 }
858
/*
 * Adjust extent reference counts and backref flags when COWing tree
 * block @buf into the freshly allocated copy @cow.
 *
 * On success, *last_ref is set to 1 if @buf dropped its last reference
 * as a result of the COW; the caller is then responsible for freeing
 * the old block.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Look up the current refcount and flags of the extent. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A shareable block with zero references means the
			 * extent tree is inconsistent; force read-only.
			 */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* An unshareable block has exactly one reference. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Blocks owned by the relocation tree must carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the shared block to full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the updated backref flags on disk. */
			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		/* The old block loses its only reference; let it go. */
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
969
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success, a negative errno on failure (the transaction is
 * aborted on most failure paths).
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller wants the result in the same buffer pointer, we
	 * must unlock the original buffer before returning.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* COW of ref-counted trees must happen inside the running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* Use the first key of the block as an allocation hint key. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	/*
	 * If we are COWing a node/leaf from the extent, chunk or device trees,
	 * make sure that we do not finish block group creation of pending block
	 * groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk and device trees, therefore we could
	 * deadlock with ourselves since we are holding a lock on an extent
	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Clone the contents, then fix up the header for the new location. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	/* Fix up refcounts/backrefs; last_ref tells us if buf is now dead. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: swap the root pointer to the new copy. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Re-point the parent slot at the new copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* Return the new buffer; it is locked and dirty. */
	*cow_ret = cow;
	return 0;
}
1110
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 *
 * Returns the tree mod log element describing the oldest logged state of
 * the root (a MOD_LOG_ROOT_REPLACE entry when the root was replaced), or
 * NULL when there is nothing logged for it at @time_seq.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;	/* set once we have followed one root replacement */

	/* time_seq == 0 means tree mod logging is not active for us. */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* Walk further back: the old root of this replacement. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1164
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 *
 * @eb is modified in place: keys, block pointers, generations and the
 * nritems count are restored to their state at @time_seq.
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	/* Hold the log lock so the rbtree of mod log elements is stable. */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: put the key/ptr back and grow n. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replace: restore the previous key/ptr. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move by copying in the reverse direction. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once we hit log entries for a different block. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1240
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 *
 * Returns NULL on allocation failure (with @eb unlocked and released).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No time_seq: tree mod logging is not in effect. */
	if (!time_seq)
		return eb;

	/* Leaves are never rewound through the mod log. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	/* Nothing logged for this block at time_seq: nothing to rewind. */
	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed; rebuild its old contents on a dummy
		 * buffer, preserving the header fields of the original.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private clone so the live buffer is untouched. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand the lock and the reference over from eb to eb_rewin. */
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1301
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes: the current root is already correct. */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind from the old root block. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The old root block still exists on disk; read it and
		 * clone it so the rewind doesn't touch the cached copy.
		 */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root was freed; rebuild it entirely from the log. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Rewind on a clone of the current root. */
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		/* Make the rebuilt buffer look like the old root block. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1382
1383 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1384 {
1385         struct tree_mod_elem *tm;
1386         int level;
1387         struct extent_buffer *eb_root = btrfs_root_node(root);
1388
1389         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1390         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1391                 level = tm->old_root.level;
1392         } else {
1393                 level = btrfs_header_level(eb_root);
1394         }
1395         free_extent_buffer(eb_root);
1396
1397         return level;
1398 }
1399
/*
 * Return 1 if @buf must be COWed before it may be modified in @trans,
 * 0 if it can be written in place.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished coping src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
1429
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * On success *cow_ret points at the (possibly unchanged) buffer to use.
 * Returns 0 on success or a negative errno from __btrfs_cow_block().
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* Sanity checks: we must be inside the running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction: reuse the buffer as is. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the 1GiB-aligned region of buf. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1472
1473 /*
1474  * helper function for defrag to decide if two blocks pointed to by a
1475  * node are actually close by
1476  */
1477 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1478 {
1479         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1480                 return 1;
1481         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1482                 return 1;
1483         return 0;
1484 }
1485
1486 /*
1487  * compare two keys in a memcmp fashion
1488  */
1489 static int comp_keys(const struct btrfs_disk_key *disk,
1490                      const struct btrfs_key *k2)
1491 {
1492         struct btrfs_key k1;
1493
1494         btrfs_disk_key_to_cpu(&k1, disk);
1495
1496         return btrfs_comp_cpu_keys(&k1, k2);
1497 }
1498
1499 /*
1500  * same as comp_keys only with two btrfs_key's
1501  */
1502 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1503 {
1504         if (k1->objectid > k2->objectid)
1505                 return 1;
1506         if (k1->objectid < k2->objectid)
1507                 return -1;
1508         if (k1->type > k2->type)
1509                 return 1;
1510         if (k1->type < k2->type)
1511                 return -1;
1512         if (k1->offset > k2->offset)
1513                 return 1;
1514         if (k1->offset < k2->offset)
1515                 return -1;
1516         return 0;
1517 }
1518
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @start_slot: first slot of @parent to consider
 * @last_ret:   in/out allocation hint; updated to the last COWed block
 * @progress:   skip children whose key is below this progress key
 *
 * Returns 0 on success or the first error hit while reading/COWing.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	/* Defrag must run inside the current running transaction. */
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to reorder with fewer than two children. */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		/* Skip children we already processed in a previous pass. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * Only relocate a child that is far from both of its
		 * neighbours; close blocks are already well placed.
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* Get an uptodate copy of the child block. */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * COW the child near search_start; the empty_size hint asks
		 * for contiguous space for the blocks we expect to COW next.
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1635
1636 /*
1637  * search for key in the extent_buffer.  The items start at offset p,
1638  * and they are item_size apart.  There are 'max' items in p.
1639  *
1640  * the slot in the array is returned via slot, and it points to
1641  * the place where you would insert key if it is not found in
1642  * the array.
1643  *
1644  * slot may point to max if the key is bigger than all of the keys
1645  */
1646 static noinline int generic_bin_search(struct extent_buffer *eb,
1647                                        unsigned long p, int item_size,
1648                                        const struct btrfs_key *key,
1649                                        int max, int *slot)
1650 {
1651         int low = 0;
1652         int high = max;
1653         int mid;
1654         int ret;
1655         struct btrfs_disk_key *tmp = NULL;
1656         struct btrfs_disk_key unaligned;
1657         unsigned long offset;
1658         char *kaddr = NULL;
1659         unsigned long map_start = 0;
1660         unsigned long map_len = 0;
1661         int err;
1662
1663         if (low > high) {
1664                 btrfs_err(eb->fs_info,
1665                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1666                           __func__, low, high, eb->start,
1667                           btrfs_header_owner(eb), btrfs_header_level(eb));
1668                 return -EINVAL;
1669         }
1670
1671         while (low < high) {
1672                 mid = (low + high) / 2;
1673                 offset = p + mid * item_size;
1674
1675                 if (!kaddr || offset < map_start ||
1676                     (offset + sizeof(struct btrfs_disk_key)) >
1677                     map_start + map_len) {
1678
1679                         err = map_private_extent_buffer(eb, offset,
1680                                                 sizeof(struct btrfs_disk_key),
1681                                                 &kaddr, &map_start, &map_len);
1682
1683                         if (!err) {
1684                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1685                                                         map_start);
1686                         } else if (err == 1) {
1687                                 read_extent_buffer(eb, &unaligned,
1688                                                    offset, sizeof(unaligned));
1689                                 tmp = &unaligned;
1690                         } else {
1691                                 return err;
1692                         }
1693
1694                 } else {
1695                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1696                                                         map_start);
1697                 }
1698                 ret = comp_keys(tmp, key);
1699
1700                 if (ret < 0)
1701                         low = mid + 1;
1702                 else if (ret > 0)
1703                         high = mid;
1704                 else {
1705                         *slot = mid;
1706                         return 0;
1707                 }
1708         }
1709         *slot = low;
1710         return 1;
1711 }
1712
1713 /*
1714  * simple bin_search frontend that does the right thing for
1715  * leaves vs nodes
1716  */
1717 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1718                      int level, int *slot)
1719 {
1720         if (level == 0)
1721                 return generic_bin_search(eb,
1722                                           offsetof(struct btrfs_leaf, items),
1723                                           sizeof(struct btrfs_item),
1724                                           key, btrfs_header_nritems(eb),
1725                                           slot);
1726         else
1727                 return generic_bin_search(eb,
1728                                           offsetof(struct btrfs_node, ptrs),
1729                                           sizeof(struct btrfs_key_ptr),
1730                                           key, btrfs_header_nritems(eb),
1731                                           slot);
1732 }
1733
1734 static void root_add_used(struct btrfs_root *root, u32 size)
1735 {
1736         spin_lock(&root->accounting_lock);
1737         btrfs_set_root_used(&root->root_item,
1738                             btrfs_root_used(&root->root_item) + size);
1739         spin_unlock(&root->accounting_lock);
1740 }
1741
1742 static void root_sub_used(struct btrfs_root *root, u32 size)
1743 {
1744         spin_lock(&root->accounting_lock);
1745         btrfs_set_root_used(&root->root_item,
1746                             btrfs_root_used(&root->root_item) - size);
1747         spin_unlock(&root->accounting_lock);
1748 }
1749
1750 /* given a node and slot number, this reads the blocks it points to.  The
1751  * extent buffer is returned with a reference taken (but unlocked).
1752  */
1753 static noinline struct extent_buffer *
1754 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1755                int slot)
1756 {
1757         int level = btrfs_header_level(parent);
1758         struct extent_buffer *eb;
1759         struct btrfs_key first_key;
1760
1761         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1762                 return ERR_PTR(-ENOENT);
1763
1764         BUG_ON(level == 0);
1765
1766         btrfs_node_key_to_cpu(parent, &first_key, slot);
1767         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1768                              btrfs_node_ptr_generation(parent, slot),
1769                              level - 1, &first_key);
1770         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1771                 free_extent_buffer(eb);
1772                 eb = ERR_PTR(-EIO);
1773         }
1774
1775         return eb;
1776 }
1777
1778 /*
1779  * node level balancing, used to make sure nodes are in proper order for
1780  * item deletion.  We balance from the top down, so we have to make sure
1781  * that a deletion won't leave an node completely empty later on.
1782  */
1783 static noinline int balance_level(struct btrfs_trans_handle *trans,
1784                          struct btrfs_root *root,
1785                          struct btrfs_path *path, int level)
1786 {
1787         struct btrfs_fs_info *fs_info = root->fs_info;
1788         struct extent_buffer *right = NULL;
1789         struct extent_buffer *mid;
1790         struct extent_buffer *left = NULL;
1791         struct extent_buffer *parent = NULL;
1792         int ret = 0;
1793         int wret;
1794         int pslot;
1795         int orig_slot = path->slots[level];
1796         u64 orig_ptr;
1797
1798         ASSERT(level > 0);
1799
1800         mid = path->nodes[level];
1801
1802         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1803                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1804         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1805
1806         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1807
1808         if (level < BTRFS_MAX_LEVEL - 1) {
1809                 parent = path->nodes[level + 1];
1810                 pslot = path->slots[level + 1];
1811         }
1812
1813         /*
1814          * deal with the case where there is only one pointer in the root
1815          * by promoting the node below to a root
1816          */
1817         if (!parent) {
1818                 struct extent_buffer *child;
1819
1820                 if (btrfs_header_nritems(mid) != 1)
1821                         return 0;
1822
1823                 /* promote the child to a root */
1824                 child = read_node_slot(fs_info, mid, 0);
1825                 if (IS_ERR(child)) {
1826                         ret = PTR_ERR(child);
1827                         btrfs_handle_fs_error(fs_info, ret, NULL);
1828                         goto enospc;
1829                 }
1830
1831                 btrfs_tree_lock(child);
1832                 btrfs_set_lock_blocking(child);
1833                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1834                 if (ret) {
1835                         btrfs_tree_unlock(child);
1836                         free_extent_buffer(child);
1837                         goto enospc;
1838                 }
1839
1840                 ret = tree_mod_log_insert_root(root->node, child, 1);
1841                 BUG_ON(ret < 0);
1842                 rcu_assign_pointer(root->node, child);
1843
1844                 add_root_to_dirty_list(root);
1845                 btrfs_tree_unlock(child);
1846
1847                 path->locks[level] = 0;
1848                 path->nodes[level] = NULL;
1849                 clean_tree_block(fs_info, mid);
1850                 btrfs_tree_unlock(mid);
1851                 /* once for the path */
1852                 free_extent_buffer(mid);
1853
1854                 root_sub_used(root, mid->len);
1855                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1856                 /* once for the root ptr */
1857                 free_extent_buffer_stale(mid);
1858                 return 0;
1859         }
1860         if (btrfs_header_nritems(mid) >
1861             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1862                 return 0;
1863
1864         left = read_node_slot(fs_info, parent, pslot - 1);
1865         if (IS_ERR(left))
1866                 left = NULL;
1867
1868         if (left) {
1869                 btrfs_tree_lock(left);
1870                 btrfs_set_lock_blocking(left);
1871                 wret = btrfs_cow_block(trans, root, left,
1872                                        parent, pslot - 1, &left);
1873                 if (wret) {
1874                         ret = wret;
1875                         goto enospc;
1876                 }
1877         }
1878
1879         right = read_node_slot(fs_info, parent, pslot + 1);
1880         if (IS_ERR(right))
1881                 right = NULL;
1882
1883         if (right) {
1884                 btrfs_tree_lock(right);
1885                 btrfs_set_lock_blocking(right);
1886                 wret = btrfs_cow_block(trans, root, right,
1887                                        parent, pslot + 1, &right);
1888                 if (wret) {
1889                         ret = wret;
1890                         goto enospc;
1891                 }
1892         }
1893
1894         /* first, try to make some room in the middle buffer */
1895         if (left) {
1896                 orig_slot += btrfs_header_nritems(left);
1897                 wret = push_node_left(trans, fs_info, left, mid, 1);
1898                 if (wret < 0)
1899                         ret = wret;
1900         }
1901
1902         /*
1903          * then try to empty the right most buffer into the middle
1904          */
1905         if (right) {
1906                 wret = push_node_left(trans, fs_info, mid, right, 1);
1907                 if (wret < 0 && wret != -ENOSPC)
1908                         ret = wret;
1909                 if (btrfs_header_nritems(right) == 0) {
1910                         clean_tree_block(fs_info, right);
1911                         btrfs_tree_unlock(right);
1912                         del_ptr(root, path, level + 1, pslot + 1);
1913                         root_sub_used(root, right->len);
1914                         btrfs_free_tree_block(trans, root, right, 0, 1);
1915                         free_extent_buffer_stale(right);
1916                         right = NULL;
1917                 } else {
1918                         struct btrfs_disk_key right_key;
1919                         btrfs_node_key(right, &right_key, 0);
1920                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1921                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1922                         BUG_ON(ret < 0);
1923                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1924                         btrfs_mark_buffer_dirty(parent);
1925                 }
1926         }
1927         if (btrfs_header_nritems(mid) == 1) {
1928                 /*
1929                  * we're not allowed to leave a node with one item in the
1930                  * tree during a delete.  A deletion from lower in the tree
1931                  * could try to delete the only pointer in this node.
1932                  * So, pull some keys from the left.
1933                  * There has to be a left pointer at this point because
1934                  * otherwise we would have pulled some pointers from the
1935                  * right
1936                  */
1937                 if (!left) {
1938                         ret = -EROFS;
1939                         btrfs_handle_fs_error(fs_info, ret, NULL);
1940                         goto enospc;
1941                 }
1942                 wret = balance_node_right(trans, fs_info, mid, left);
1943                 if (wret < 0) {
1944                         ret = wret;
1945                         goto enospc;
1946                 }
1947                 if (wret == 1) {
1948                         wret = push_node_left(trans, fs_info, left, mid, 1);
1949                         if (wret < 0)
1950                                 ret = wret;
1951                 }
1952                 BUG_ON(wret == 1);
1953         }
1954         if (btrfs_header_nritems(mid) == 0) {
1955                 clean_tree_block(fs_info, mid);
1956                 btrfs_tree_unlock(mid);
1957                 del_ptr(root, path, level + 1, pslot);
1958                 root_sub_used(root, mid->len);
1959                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1960                 free_extent_buffer_stale(mid);
1961                 mid = NULL;
1962         } else {
1963                 /* update the parent key to reflect our changes */
1964                 struct btrfs_disk_key mid_key;
1965                 btrfs_node_key(mid, &mid_key, 0);
1966                 ret = tree_mod_log_insert_key(parent, pslot,
1967                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1968                 BUG_ON(ret < 0);
1969                 btrfs_set_node_key(parent, &mid_key, pslot);
1970                 btrfs_mark_buffer_dirty(parent);
1971         }
1972
1973         /* update the path */
1974         if (left) {
1975                 if (btrfs_header_nritems(left) > orig_slot) {
1976                         extent_buffer_get(left);
1977                         /* left was locked after cow */
1978                         path->nodes[level] = left;
1979                         path->slots[level + 1] -= 1;
1980                         path->slots[level] = orig_slot;
1981                         if (mid) {
1982                                 btrfs_tree_unlock(mid);
1983                                 free_extent_buffer(mid);
1984                         }
1985                 } else {
1986                         orig_slot -= btrfs_header_nritems(left);
1987                         path->slots[level] = orig_slot;
1988                 }
1989         }
1990         /* double check we haven't messed things up */
1991         if (orig_ptr !=
1992             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1993                 BUG();
1994 enospc:
1995         if (right) {
1996                 btrfs_tree_unlock(right);
1997                 free_extent_buffer(right);
1998         }
1999         if (left) {
2000                 if (path->nodes[level] != left)
2001                         btrfs_tree_unlock(left);
2002                 free_extent_buffer(left);
2003         }
2004         return ret;
2005 }
2006
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if some pointers were pushed and the path was adjusted,
 * 1 if no pushing was possible (caller must split), or < 0 on error.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];

        if (level == 0)
                return 1;

        mid = path->nodes[level];
        /* mid must have been COWed in this transaction already. */
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /* No parent means mid is the root; nothing to push into. */
        if (!parent)
                return 1;

        left = read_node_slot(fs_info, parent, pslot - 1);
        if (IS_ERR(left))
                left = NULL;

        /* first, try to make some room in the middle buffer */
        if (left) {
                u32 left_nr;

                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);

                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* left is (nearly) full: can't take anything. */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
                                              pslot - 1, &left);
                        if (ret)
                                wret = 1;
                        else {
                                wret = push_node_left(trans, fs_info,
                                                      left, mid, 0);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* Push succeeded: fix mid's key in the parent. */
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
                        ret = tree_mod_log_insert_key(parent, pslot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
                                /* Our slot moved into left: repoint path. */
                                path->nodes[level] = left;
                                path->slots[level + 1] -= 1;
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                orig_slot -=
                                        btrfs_header_nritems(left);
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                        }
                        return 0;
                }
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        right = read_node_slot(fs_info, parent, pslot + 1);
        if (IS_ERR(right))
                right = NULL;

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                u32 right_nr;

                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);

                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* right is (nearly) full: can't take anything. */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
                                              &right);
                        if (ret)
                                wret = 1;
                        else {
                                wret = balance_node_right(trans, fs_info,
                                                          right, mid);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* Push succeeded: fix right's key in the parent. */
                        struct btrfs_disk_key disk_key;

                        btrfs_node_key(right, &disk_key, 0);
                        ret = tree_mod_log_insert_key(parent, pslot + 1,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);

                        if (btrfs_header_nritems(mid) <= orig_slot) {
                                /* Our slot moved into right: repoint path. */
                                path->nodes[level] = right;
                                path->slots[level + 1] += 1;
                                path->slots[level] = orig_slot -
                                        btrfs_header_nritems(mid);
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                        }
                        return 0;
                }
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 1;
}
2149
2150 /*
2151  * readahead one full node of leaves, finding things that are close
2152  * to the block in 'slot', and triggering ra on them.
2153  */
2154 static void reada_for_search(struct btrfs_fs_info *fs_info,
2155                              struct btrfs_path *path,
2156                              int level, int slot, u64 objectid)
2157 {
2158         struct extent_buffer *node;
2159         struct btrfs_disk_key disk_key;
2160         u32 nritems;
2161         u64 search;
2162         u64 target;
2163         u64 nread = 0;
2164         struct extent_buffer *eb;
2165         u32 nr;
2166         u32 blocksize;
2167         u32 nscan = 0;
2168
2169         if (level != 1)
2170                 return;
2171
2172         if (!path->nodes[level])
2173                 return;
2174
2175         node = path->nodes[level];
2176
2177         search = btrfs_node_blockptr(node, slot);
2178         blocksize = fs_info->nodesize;
2179         eb = find_extent_buffer(fs_info, search);
2180         if (eb) {
2181                 free_extent_buffer(eb);
2182                 return;
2183         }
2184
2185         target = search;
2186
2187         nritems = btrfs_header_nritems(node);
2188         nr = slot;
2189
2190         while (1) {
2191                 if (path->reada == READA_BACK) {
2192                         if (nr == 0)
2193                                 break;
2194                         nr--;
2195                 } else if (path->reada == READA_FORWARD) {
2196                         nr++;
2197                         if (nr >= nritems)
2198                                 break;
2199                 }
2200                 if (path->reada == READA_BACK && objectid) {
2201                         btrfs_node_key(node, &disk_key, nr);
2202                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2203                                 break;
2204                 }
2205                 search = btrfs_node_blockptr(node, nr);
2206                 if ((search <= target && target - search <= 65536) ||
2207                     (search > target && search - target <= 65536)) {
2208                         readahead_tree_block(fs_info, search);
2209                         nread += blocksize;
2210                 }
2211                 nscan++;
2212                 if ((nread > 65536 || nscan > 32))
2213                         break;
2214         }
2215 }
2216
2217 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2218                                        struct btrfs_path *path, int level)
2219 {
2220         int slot;
2221         int nritems;
2222         struct extent_buffer *parent;
2223         struct extent_buffer *eb;
2224         u64 gen;
2225         u64 block1 = 0;
2226         u64 block2 = 0;
2227
2228         parent = path->nodes[level + 1];
2229         if (!parent)
2230                 return;
2231
2232         nritems = btrfs_header_nritems(parent);
2233         slot = path->slots[level + 1];
2234
2235         if (slot > 0) {
2236                 block1 = btrfs_node_blockptr(parent, slot - 1);
2237                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2238                 eb = find_extent_buffer(fs_info, block1);
2239                 /*
2240                  * if we get -eagain from btrfs_buffer_uptodate, we
2241                  * don't want to return eagain here.  That will loop
2242                  * forever
2243                  */
2244                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2245                         block1 = 0;
2246                 free_extent_buffer(eb);
2247         }
2248         if (slot + 1 < nritems) {
2249                 block2 = btrfs_node_blockptr(parent, slot + 1);
2250                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2251                 eb = find_extent_buffer(fs_info, block2);
2252                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2253                         block2 = 0;
2254                 free_extent_buffer(eb);
2255         }
2256
2257         if (block1)
2258                 readahead_tree_block(fs_info, block1);
2259         if (block2)
2260                 readahead_tree_block(fs_info, block2);
2261 }
2262
2263
2264 /*
2265  * when we walk down the tree, it is usually safe to unlock the higher layers
2266  * in the tree.  The exceptions are when our path goes through slot 0, because
2267  * operations on the tree might require changing key pointers higher up in the
2268  * tree.
2269  *
2270  * callers might also have set path->keep_locks, which tells this code to keep
2271  * the lock if the path points to the last slot in the block.  This is part of
2272  * walking through the tree, and selecting the next slot in the higher block.
2273  *
2274  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2275  * if lowest_unlock is 1, level 0 won't be unlocked
2276  */
2277 static noinline void unlock_up(struct btrfs_path *path, int level,
2278                                int lowest_unlock, int min_write_lock_level,
2279                                int *write_lock_level)
2280 {
2281         int i;
2282         int skip_level = level;
2283         int no_skips = 0;
2284         struct extent_buffer *t;
2285
2286         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2287                 if (!path->nodes[i])
2288                         break;
2289                 if (!path->locks[i])
2290                         break;
2291                 if (!no_skips && path->slots[i] == 0) {
2292                         skip_level = i + 1;
2293                         continue;
2294                 }
2295                 if (!no_skips && path->keep_locks) {
2296                         u32 nritems;
2297                         t = path->nodes[i];
2298                         nritems = btrfs_header_nritems(t);
2299                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2300                                 skip_level = i + 1;
2301                                 continue;
2302                         }
2303                 }
2304                 if (skip_level < i && i >= lowest_unlock)
2305                         no_skips = 1;
2306
2307                 t = path->nodes[i];
2308                 if (i >= lowest_unlock && i > skip_level) {
2309                         btrfs_tree_unlock_rw(t, path->locks[i]);
2310                         path->locks[i] = 0;
2311                         if (write_lock_level &&
2312                             i > min_write_lock_level &&
2313                             i <= *write_lock_level) {
2314                                 *write_lock_level = i - 1;
2315                         }
2316                 }
2317         }
2318 }
2319
2320 /*
2321  * This releases any locks held in the path starting at level and
2322  * going all the way up to the root.
2323  *
2324  * btrfs_search_slot will keep the lock held on higher nodes in a few
2325  * corner cases, such as COW of the block at slot zero in the node.  This
2326  * ignores those rules, and it should only be called when there are no
2327  * more updates to be done higher up in the tree.
2328  */
2329 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2330 {
2331         int i;
2332
2333         if (path->keep_locks)
2334                 return;
2335
2336         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2337                 if (!path->nodes[i])
2338                         continue;
2339                 if (!path->locks[i])
2340                         continue;
2341                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2342                 path->locks[i] = 0;
2343         }
2344 }
2345
2346 /*
2347  * helper function for btrfs_search_slot.  The goal is to find a block
2348  * in cache without setting the path to blocking.  If we find the block
2349  * we return zero and the path is unchanged.
2350  *
2351  * If we can't find the block, we set the path blocking and do some
2352  * reada.  -EAGAIN is returned and the search must be repeated.
2353  */
2354 static int
2355 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2356                       struct extent_buffer **eb_ret, int level, int slot,
2357                       const struct btrfs_key *key)
2358 {
2359         struct btrfs_fs_info *fs_info = root->fs_info;
2360         u64 blocknr;
2361         u64 gen;
2362         struct extent_buffer *b = *eb_ret;
2363         struct extent_buffer *tmp;
2364         struct btrfs_key first_key;
2365         int ret;
2366         int parent_level;
2367
             /*
              * The parent node *b is still locked here, so the child's block
              * pointer, generation and first key are stable while we read them.
              */
2368         blocknr = btrfs_node_blockptr(b, slot);
2369         gen = btrfs_node_ptr_generation(b, slot);
2370         parent_level = btrfs_header_level(b);
2371         btrfs_node_key_to_cpu(b, &first_key, slot);
2372
2373         tmp = find_extent_buffer(fs_info, blocknr);
2374         if (tmp) {
2375                 /* first we do an atomic uptodate check */
2376                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
                             /* Cached and current: return it without blocking. */
2377                         *eb_ret = tmp;
2378                         return 0;
2379                 }
2380
2381                 /* the pages were up to date, but we failed
2382                  * the generation number check.  Do a full
2383                  * read for the generation number that is correct.
2384                  * We must do this without dropping locks so
2385                  * we can trust our generation number
2386                  */
2387                 btrfs_set_path_blocking(p);
2388
2389                 /* now we're allowed to do a blocking uptodate check */
2390                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2391                 if (!ret) {
2392                         *eb_ret = tmp;
2393                         return 0;
2394                 }
                     /* Read failed: drop our reference and the whole path. */
2395                 free_extent_buffer(tmp);
2396                 btrfs_release_path(p);
2397                 return -EIO;
2398         }
2399
2400         /*
2401          * reduce lock contention at high levels
2402          * of the btree by dropping locks before
2403          * we read.  Don't release the lock on the current
2404          * level because we need to walk this node to figure
2405          * out which blocks to read.
2406          */
2407         btrfs_unlock_up_safe(p, level + 1);
2408         btrfs_set_path_blocking(p);
2409
2410         if (p->reada != READA_NONE)
2411                 reada_for_search(fs_info, p, level, slot, key->objectid);
2412
             /*
              * We dropped locks above, so even after a successful read the
              * caller must restart the search: -EAGAIN is the default result.
              */
2413         ret = -EAGAIN;
2414         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2415                               &first_key);
2416         if (!IS_ERR(tmp)) {
2417                 /*
2418                  * If the read above didn't mark this buffer up to date,
2419                  * it will never end up being up to date.  Set ret to EIO now
2420                  * and give up so that our caller doesn't loop forever
2421                  * on our EAGAINs.
2422                  */
2423                 if (!extent_buffer_uptodate(tmp))
2424                         ret = -EIO;
2425                 free_extent_buffer(tmp);
2426         } else {
2427                 ret = PTR_ERR(tmp);
2428         }
2429
2430         btrfs_release_path(p);
2431         return ret;
2432 }
2433
2434 /*
2435  * helper function for btrfs_search_slot.  This does all of the checks
2436  * for node-level blocks and does any balancing required based on
2437  * the ins_len.
2438  *
2439  * If no extra work was required, zero is returned.  If we had to
2440  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2441  * start over
2442  */
2443 static int
2444 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2445                        struct btrfs_root *root, struct btrfs_path *p,
2446                        struct extent_buffer *b, int level, int ins_len,
2447                        int *write_lock_level)
2448 {
2449         struct btrfs_fs_info *fs_info = root->fs_info;
2450         int ret;
2451
             /*
              * Inserting (or searching for a split point) and the node is
              * nearly full: split it now so the descent below cannot
              * overflow it.
              */
2452         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2453             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2454                 int sret;
2455
                     /* Splitting touches the parent too; it must be write locked. */
2456                 if (*write_lock_level < level + 1) {
2457                         *write_lock_level = level + 1;
2458                         btrfs_release_path(p);
2459                         goto again;
2460                 }
2461
2462                 btrfs_set_path_blocking(p);
2463                 reada_for_balance(fs_info, p, level);
2464                 sret = split_node(trans, root, p, level);
2465
2466                 BUG_ON(sret > 0);
2467                 if (sret) {
2468                         ret = sret;
2469                         goto done;
2470                 }
2471                 b = p->nodes[level];
2472         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2473                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
                     /* Deleting and the node is under half full: rebalance it. */
2474                 int sret;
2475
2476                 if (*write_lock_level < level + 1) {
2477                         *write_lock_level = level + 1;
2478                         btrfs_release_path(p);
2479                         goto again;
2480                 }
2481
2482                 btrfs_set_path_blocking(p);
2483                 reada_for_balance(fs_info, p, level);
2484                 sret = balance_level(trans, root, p, level);
2485
2486                 if (sret) {
2487                         ret = sret;
2488                         goto done;
2489                 }
2490                 b = p->nodes[level];
                     /* balance_level may have removed the node at this level. */
2491                 if (!b) {
2492                         btrfs_release_path(p);
2493                         goto again;
2494                 }
2495                 BUG_ON(btrfs_header_nritems(b) == 1);
2496         }
2497         return 0;
2498
2499 again:
2500         ret = -EAGAIN;
2501 done:
2502         return ret;
2503 }
2504
/*
 * Debug check for key_search's prev_cmp == 0 fast path: assert that the
 * key at slot 0 of @b really matches @key.  Compiles to nothing unless
 * CONFIG_BTRFS_ASSERT is enabled.
 */
static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long off;

	btrfs_cpu_key_to_disk(&disk_key, key);
	off = (level == 0) ? offsetof(struct btrfs_leaf, items[0].key)
			   : offsetof(struct btrfs_node, ptrs[0].key);
	ASSERT(!memcmp_extent_buffer(b, &disk_key, off, sizeof(disk_key)));
#endif
}
2524
/*
 * Binary-search wrapper used while walking down the tree.  While
 * *prev_cmp is an exact match (0), every lower level must match at
 * slot 0, so the search can be skipped; otherwise do a real
 * btrfs_bin_search and remember its result in *prev_cmp.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		/* Exact match above: the key must sit at slot 0 here. */
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2538
/*
 * Look up the item (@iobjectid, @key_type, @ioff) in @fs_root and copy the
 * key actually found into @found_key.
 *
 * Returns 0 when an item with matching objectid and type is found (the
 * offset stored on disk may differ from @ioff), 1 when no such item
 * exists, and < 0 on error.  The caller owns @path and must release it.
 */
2539 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2540                 u64 iobjectid, u64 ioff, u8 key_type,
2541                 struct btrfs_key *found_key)
2542 {
2543         int ret;
2544         struct btrfs_key key;
2545         struct extent_buffer *eb;
2546
2547         ASSERT(path);
2548         ASSERT(found_key);
2549
2550         key.type = key_type;
2551         key.objectid = iobjectid;
2552         key.offset = ioff;
2553
2554         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2555         if (ret < 0)
2556                 return ret;
2557
2558         eb = path->nodes[0];
             /* Miss that landed past the last item: step to the next leaf. */
2559         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2560                 ret = btrfs_next_leaf(fs_root, path);
2561                 if (ret)
2562                         return ret;
2563                 eb = path->nodes[0];
2564         }
2565
2566         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
             /* Only objectid and type must match; the offset is reported back. */
2567         if (found_key->type != key.type ||
2568                         found_key->objectid != key.objectid)
2569                 return 1;
2570
2571         return 0;
2572 }
2573
/*
 * Fetch the root node of @root for a search and lock it appropriately: a
 * read lock is preferred, a write lock is taken when @write_lock_level
 * says the root level itself must be write locked, and no lock at all is
 * taken for commit-root or skip_locking searches.  The node is stored in
 * @p at its level and returned; the caller must drop the reference.
 */
2574 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2575                                                         struct btrfs_path *p,
2576                                                         int write_lock_level)
2577 {
2578         struct btrfs_fs_info *fs_info = root->fs_info;
2579         struct extent_buffer *b;
2580         int root_lock;
2581         int level = 0;
2582
2583         /* We try very hard to do read locks on the root */
2584         root_lock = BTRFS_READ_LOCK;
2585
2586         if (p->search_commit_root) {
2587                 /* The commit roots are read only so we always do read locks */
2588                 if (p->need_commit_sem)
2589                         down_read(&fs_info->commit_root_sem);
2590                 b = root->commit_root;
                     /* Take a ref under the sem so the commit root can't go away. */
2591                 extent_buffer_get(b);
2592                 level = btrfs_header_level(b);
2593                 if (p->need_commit_sem)
2594                         up_read(&fs_info->commit_root_sem);
2595                 /*
2596                  * Ensure that all callers have set skip_locking when
2597                  * p->search_commit_root = 1.
2598                  */
2599                 ASSERT(p->skip_locking == 1);
2600
2601                 goto out;
2602         }
2603
2604         if (p->skip_locking) {
2605                 b = btrfs_root_node(root);
2606                 level = btrfs_header_level(b);
2607                 goto out;
2608         }
2609
2610         /*
2611          * If the level is set to maximum, we can skip trying to get the read
2612          * lock.
2613          */
2614         if (write_lock_level < BTRFS_MAX_LEVEL) {
2615                 /*
2616                  * We don't know the level of the root node until we actually
2617                  * have it read locked
2618                  */
2619                 b = btrfs_read_lock_root_node(root);
2620                 level = btrfs_header_level(b);
2621                 if (level > write_lock_level)
2622                         goto out;
2623
2624                 /* Whoops, must trade for write lock */
2625                 btrfs_tree_read_unlock(b);
2626                 free_extent_buffer(b);
2627         }
2628
2629         b = btrfs_lock_root_node(root);
2630         root_lock = BTRFS_WRITE_LOCK;
2631
2632         /* The level might have changed, check again */
2633         level = btrfs_header_level(b);
2634
2635 out:
2636         p->nodes[level] = b;
2637         if (!p->skip_locking)
2638                 p->locks[level] = root_lock;
2639         /*
2640          * Callers are responsible for dropping b's references.
2641          */
2642         return b;
2643 }
2644
2645
2646 /*
2647  * btrfs_search_slot - look for a key in a tree and perform necessary
2648  * modifications to preserve tree invariants.
2649  *
2650  * @trans:      Handle of transaction, used when modifying the tree
2651  * @p:          Holds all btree nodes along the search path
2652  * @root:       The root node of the tree
2653  * @key:        The key we are looking for
2654  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2655  *              deletions it's -1. 0 for plain searches
2656  * @cow:        boolean should CoW operations be performed. Must always be 1
2657  *              when modifying the tree.
2658  *
2659  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2660  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2661  *
2662  * If @key is found, 0 is returned and you can find the item in the leaf level
2663  * of the path (level 0)
2664  *
2665  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2666  * points to the slot where it should be inserted
2667  *
2668  * If an error is encountered while searching the tree a negative error number
2669  * is returned
2670  */
2671 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2672                       const struct btrfs_key *key, struct btrfs_path *p,
2673                       int ins_len, int cow)
2674 {
2675         struct btrfs_fs_info *fs_info = root->fs_info;
2676         struct extent_buffer *b;
2677         int slot;
2678         int ret;
2679         int err;
2680         int level;
2681         int lowest_unlock = 1;
2682         /* everything at write_lock_level or lower must be write locked */
2683         int write_lock_level = 0;
2684         u8 lowest_level = 0;
2685         int min_write_lock_level;
2686         int prev_cmp;
2687
2688         lowest_level = p->lowest_level;
2689         WARN_ON(lowest_level && ins_len > 0);
2690         WARN_ON(p->nodes[0] != NULL);
2691         BUG_ON(!cow && ins_len);
2692
2693         if (ins_len < 0) {
2694                 lowest_unlock = 2;
2695
2696                 /* when we are removing items, we might have to go up to level
2697                  * two as we update tree pointers.  Make sure we keep write
2698                  * locks for those levels as well
2699                  */
2700                 write_lock_level = 2;
2701         } else if (ins_len > 0) {
2702                 /*
2703                  * for inserting items, make sure we have a write lock on
2704                  * level 1 so we can update keys
2705                  */
2706                 write_lock_level = 1;
2707         }
2708
2709         if (!cow)
2710                 write_lock_level = -1;
2711
2712         if (cow && (p->keep_locks || p->lowest_level))
2713                 write_lock_level = BTRFS_MAX_LEVEL;
2714
2715         min_write_lock_level = write_lock_level;
2716
2717 again:
             /* -1 forces a real binary search at the next level (see key_search). */
2718         prev_cmp = -1;
2719         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2720
2721         while (b) {
2722                 level = btrfs_header_level(b);
2723
2724                 /*
2725                  * setup the path here so we can release it under lock
2726                  * contention with the cow code
2727                  */
2728                 if (cow) {
2729                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2730
2731                         /*
2732                          * if we don't really need to cow this block
2733                          * then we don't want to set the path blocking,
2734                          * so we test it here
2735                          */
2736                         if (!should_cow_block(trans, root, b)) {
2737                                 trans->dirty = true;
2738                                 goto cow_done;
2739                         }
2740
2741                         /*
2742                          * must have write locks on this node and the
2743                          * parent
2744                          */
2745                         if (level > write_lock_level ||
2746                             (level + 1 > write_lock_level &&
2747                             level + 1 < BTRFS_MAX_LEVEL &&
2748                             p->nodes[level + 1])) {
2749                                 write_lock_level = level + 1;
2750                                 btrfs_release_path(p);
2751                                 goto again;
2752                         }
2753
2754                         btrfs_set_path_blocking(p);
2755                         if (last_level)
2756                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2757                                                       &b);
2758                         else
2759                                 err = btrfs_cow_block(trans, root, b,
2760                                                       p->nodes[level + 1],
2761                                                       p->slots[level + 1], &b);
2762                         if (err) {
2763                                 ret = err;
2764                                 goto done;
2765                         }
2766                 }
2767 cow_done:
2768                 p->nodes[level] = b;
2769                 /*
2770                  * Leave path with blocking locks to avoid massive
2771                  * lock context switch, this is made on purpose.
2772                  */
2773
2774                 /*
2775                  * we have a lock on b and as long as we aren't changing
2776                  * the tree, there is no way for the items in b to change.
2777                  * It is safe to drop the lock on our parent before we
2778                  * go through the expensive btree search on b.
2779                  *
2780                  * If we're inserting or deleting (ins_len != 0), then we might
2781                  * be changing slot zero, which may require changing the parent.
2782                  * So, we can't drop the lock until after we know which slot
2783                  * we're operating on.
2784                  */
2785                 if (!ins_len && !p->keep_locks) {
2786                         int u = level + 1;
2787
2788                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2789                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2790                                 p->locks[u] = 0;
2791                         }
2792                 }
2793
2794                 ret = key_search(b, key, level, &prev_cmp, &slot);
2795                 if (ret < 0)
2796                         goto done;
2797
                     /* Internal node: fix it up if needed, then descend one level. */
2798                 if (level != 0) {
2799                         int dec = 0;
2800                         if (ret && slot > 0) {
2801                                 dec = 1;
2802                                 slot -= 1;
2803                         }
2804                         p->slots[level] = slot;
2805                         err = setup_nodes_for_search(trans, root, p, b, level,
2806                                              ins_len, &write_lock_level);
2807                         if (err == -EAGAIN)
2808                                 goto again;
2809                         if (err) {
2810                                 ret = err;
2811                                 goto done;
2812                         }
2813                         b = p->nodes[level];
2814                         slot = p->slots[level];
2815
2816                         /*
2817                          * slot 0 is special, if we change the key
2818                          * we have to update the parent pointer
2819                          * which means we must have a write lock
2820                          * on the parent
2821                          */
2822                         if (slot == 0 && ins_len &&
2823                             write_lock_level < level + 1) {
2824                                 write_lock_level = level + 1;
2825                                 btrfs_release_path(p);
2826                                 goto again;
2827                         }
2828
2829                         unlock_up(p, level, lowest_unlock,
2830                                   min_write_lock_level, &write_lock_level);
2831
2832                         if (level == lowest_level) {
2833                                 if (dec)
2834                                         p->slots[level]++;
2835                                 goto done;
2836                         }
2837
2838                         err = read_block_for_search(root, p, &b, level,
2839                                                     slot, key);
2840                         if (err == -EAGAIN)
2841                                 goto again;
2842                         if (err) {
2843                                 ret = err;
2844                                 goto done;
2845                         }
2846
                             /* Lock the child: write lock below write_lock_level. */
2847                         if (!p->skip_locking) {
2848                                 level = btrfs_header_level(b);
2849                                 if (level <= write_lock_level) {
2850                                         err = btrfs_try_tree_write_lock(b);
2851                                         if (!err) {
2852                                                 btrfs_set_path_blocking(p);
2853                                                 btrfs_tree_lock(b);
2854                                         }
2855                                         p->locks[level] = BTRFS_WRITE_LOCK;
2856                                 } else {
2857                                         err = btrfs_tree_read_lock_atomic(b);
2858                                         if (!err) {
2859                                                 btrfs_set_path_blocking(p);
2860                                                 btrfs_tree_read_lock(b);
2861                                         }
2862                                         p->locks[level] = BTRFS_READ_LOCK;
2863                                 }
2864                                 p->nodes[level] = b;
2865                         }
2866                 } else {
                             /* Leaf level: make room for inserts, then finish. */
2867                         p->slots[level] = slot;
2868                         if (ins_len > 0 &&
2869                             btrfs_leaf_free_space(fs_info, b) < ins_len) {
2870                                 if (write_lock_level < 1) {
2871                                         write_lock_level = 1;
2872                                         btrfs_release_path(p);
2873                                         goto again;
2874                                 }
2875
2876                                 btrfs_set_path_blocking(p);
2877                                 err = split_leaf(trans, root, key,
2878                                                  p, ins_len, ret == 0);
2879
2880                                 BUG_ON(err > 0);
2881                                 if (err) {
2882                                         ret = err;
2883                                         goto done;
2884                                 }
2885                         }
2886                         if (!p->search_for_split)
2887                                 unlock_up(p, level, lowest_unlock,
2888                                           min_write_lock_level, NULL);
2889                         goto done;
2890                 }
2891         }
2892         ret = 1;
2893 done:
2894         /*
2895          * we don't really know what they plan on doing with the path
2896          * from here on, so for now just mark it as blocking
2897          */
2898         if (!p->leave_spinning)
2899                 btrfs_set_path_blocking(p);
2900         if (ret < 0 && !p->skip_release_on_error)
2901                 btrfs_release_path(p);
2902         return ret;
2903 }
2904
2905 /*
2906  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2907  * current state of the tree together with the operations recorded in the tree
2908  * modification log to search for the key in a previous version of this tree, as
2909  * denoted by the time_seq parameter.
2910  *
2911  * Naturally, there is no support for insert, delete or cow operations.
2912  *
2913  * The resulting path and return value will be set up as if we called
2914  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2915  */
2916 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2917                           struct btrfs_path *p, u64 time_seq)
2918 {
2919         struct btrfs_fs_info *fs_info = root->fs_info;
2920         struct extent_buffer *b;
2921         int slot;
2922         int ret;
2923         int err;
2924         int level;
2925         int lowest_unlock = 1;
2926         u8 lowest_level = 0;
2927         int prev_cmp = -1;
2928
2929         lowest_level = p->lowest_level;
2930         WARN_ON(p->nodes[0] != NULL);
2931
             /* The commit root is immutable; a plain search is equivalent. */
2932         if (p->search_commit_root) {
2933                 BUG_ON(time_seq);
2934                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2935         }
2936
2937 again:
2938         b = get_old_root(root, time_seq);
2939         if (!b) {
2940                 ret = -EIO;
2941                 goto done;
2942         }
2943         level = btrfs_header_level(b);
2944         p->locks[level] = BTRFS_READ_LOCK;
2945
2946         while (b) {
2947                 level = btrfs_header_level(b);
2948                 p->nodes[level] = b;
2949
2950                 /*
2951                  * we have a lock on b and as long as we aren't changing
2952                  * the tree, there is no way for the items in b to change.
2953                  * It is safe to drop the lock on our parent before we
2954                  * go through the expensive btree search on b.
2955                  */
2956                 btrfs_unlock_up_safe(p, level + 1);
2957
2958                 /*
2959                  * Since we can unwind ebs we want to do a real search every
2960                  * time.
2961                  */
2962                 prev_cmp = -1;
2963                 ret = key_search(b, key, level, &prev_cmp, &slot);
2964
2965                 if (level != 0) {
2966                         int dec = 0;
2967                         if (ret && slot > 0) {
2968                                 dec = 1;
2969                                 slot -= 1;
2970                         }
2971                         p->slots[level] = slot;
2972                         unlock_up(p, level, lowest_unlock, 0, NULL);
2973
2974                         if (level == lowest_level) {
2975                                 if (dec)
2976                                         p->slots[level]++;
2977                                 goto done;
2978                         }
2979
2980                         err = read_block_for_search(root, p, &b, level,
2981                                                     slot, key);
2982                         if (err == -EAGAIN)
2983                                 goto again;
2984                         if (err) {
2985                                 ret = err;
2986                                 goto done;
2987                         }
2988
2989                         level = btrfs_header_level(b);
2990                         err = btrfs_tree_read_lock_atomic(b);
2991                         if (!err) {
2992                                 btrfs_set_path_blocking(p);
2993                                 btrfs_tree_read_lock(b);
2994                         }
                             /* Replay tree mod log entries so b matches time_seq. */
2995                         b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2996                         if (!b) {
2997                                 ret = -ENOMEM;
2998                                 goto done;
2999                         }
3000                         p->locks[level] = BTRFS_READ_LOCK;
3001                         p->nodes[level] = b;
3002                 } else {
3003                         p->slots[level] = slot;
3004                         unlock_up(p, level, lowest_unlock, 0, NULL);
3005                         goto done;
3006                 }
3007         }
3008         ret = 1;
3009 done:
3010         if (!p->leave_spinning)
3011                 btrfs_set_path_blocking(p);
3012         if (ret < 0)
3013                 btrfs_release_path(p);
3014
3015         return ret;
3016 }
3017
3018 /*
3019  * helper to use instead of search slot if no exact match is needed but
3020  * instead the next or previous item should be returned.
3021  * When find_higher is true, the next higher item is returned, the next lower
3022  * otherwise.
3023  * When return_any and find_higher are both true, and no higher item is found,
3024  * return the next lower instead.
3025  * When return_any is true and find_higher is false, and no lower item is found,
3026  * return the next higher instead.
3027  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3028  * < 0 on error
3029  */
3030 int btrfs_search_slot_for_read(struct btrfs_root *root,
3031                                const struct btrfs_key *key,
3032                                struct btrfs_path *p, int find_higher,
3033                                int return_any)
3034 {
3035         int ret;
3036         struct extent_buffer *leaf;
3037
3038 again:
3039         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3040         if (ret <= 0)
3041                 return ret;
3042         /*
3043          * a return value of 1 means the path is at the position where the
3044          * item should be inserted. Normally this is the next bigger item,
3045          * but in case the previous item is the last in a leaf, path points
3046          * to the first free slot in the previous leaf, i.e. at an invalid
3047          * item.
3048          */
3049         leaf = p->nodes[0];
3050
3051         if (find_higher) {
                     /* Slot is past the last item: the next higher item (if any)
                      * lives in the next leaf. */
3052                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3053                         ret = btrfs_next_leaf(root, p);
3054                         if (ret <= 0)
3055                                 return ret;
3056                         if (!return_any)
3057                                 return 1;
3058                         /*
3059                          * no higher item found, return the next
3060                          * lower instead
3061                          */
3062                         return_any = 0;
3063                         find_higher = 0;
3064                         btrfs_release_path(p);
3065                         goto again;
3066                 }
3067         } else {
3068                 if (p->slots[0] == 0) {
3069                         ret = btrfs_prev_leaf(root, p);
3070                         if (ret < 0)
3071                                 return ret;
3072                         if (!ret) {
3073                                 leaf = p->nodes[0];
                                     /* Step back from the one-past-the-end slot. */
3074                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3075                                         p->slots[0]--;
3076                                 return 0;
3077                         }
3078                         if (!return_any)
3079                                 return 1;
3080                         /*
3081                          * no lower item found, return the next
3082                          * higher instead
3083                          */
3084                         return_any = 0;
3085                         find_higher = 1;
3086                         btrfs_release_path(p);
3087                         goto again;
3088                 } else {
3089                         --p->slots[0];
3090                 }
3091         }
3092         return 0;
3093 }
3094
3095 /*
3096  * adjust the pointers going up the tree, starting at level
3097  * making sure the right key of each node points to 'key'.
3098  * This is used after shifting pointers to the left, so it stops
3099  * fixing up pointers when a given leaf/node is not in slot 0 of the
3100  * higher levels
3101  *
3102  */
3103 static void fixup_low_keys(struct btrfs_path *path,
3104                            struct btrfs_disk_key *key, int level)
3105 {
3106         int i;
3107         struct extent_buffer *t;
3108         int ret;
3109
3110         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3111                 int tslot = path->slots[i];
3112
3113                 if (!path->nodes[i])
3114                         break;
3115                 t = path->nodes[i];
3116                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3117                                 GFP_ATOMIC);
3118                 BUG_ON(ret < 0);
3119                 btrfs_set_node_key(t, key, tslot);
3120                 btrfs_mark_buffer_dirty(path->nodes[i]);
3121                 if (tslot != 0)
3122                         break;
3123         }
3124 }
3125
3126 /*
3127  * update item key.
3128  *
3129  * This function isn't completely safe. It's the caller's responsibility
3130  * that the new key won't break the order
3131  */
3132 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3133                              struct btrfs_path *path,
3134                              const struct btrfs_key *new_key)
3135 {
3136         struct btrfs_disk_key disk_key;
3137         struct extent_buffer *eb;
3138         int slot;
3139
3140         eb = path->nodes[0];
3141         slot = path->slots[0];
             /* The new key must still sort strictly after the previous item. */
3142         if (slot > 0) {
3143                 btrfs_item_key(eb, &disk_key, slot - 1);
3144                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3145         }
             /* ... and strictly before the next item. */
3146         if (slot < btrfs_header_nritems(eb) - 1) {
3147                 btrfs_item_key(eb, &disk_key, slot + 1);
3148                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3149         }
3150
3151         btrfs_cpu_key_to_disk(&disk_key, new_key);
3152         btrfs_set_item_key(eb, &disk_key, slot);
3153         btrfs_mark_buffer_dirty(eb);
             /* Slot 0 keys are duplicated in the parent nodes; fix them up. */
3154         if (slot == 0)
3155                 fixup_low_keys(path, &disk_key, 1);
3156 }
3157
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots in the destination bound how much we can push */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* both buffers must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not worth balancing a nearly empty node unless we'll empty it */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		/* keep at least 8 pointers in src when not emptying it */
		push_items = min(src_nritems - 8, push_items);

	/* record the key moves in the tree mod log before copying */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* append the first push_items ptrs of src after dst's last ptr */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3230
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots in the destination bound how much we can push */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth balancing a nearly empty node */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing ptrs right to make room at slot 0 */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* log and copy the tail of src into the front of dst */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3297
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* must be called for the level just above the current root */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* single pointer down to the old root */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	/* the old root must have been COWed in this transaction */
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* log the root swap, then publish the new root via RCU */
	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* extra ref for the path's reference to the new node */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3358
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* caller guarantees there is room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	if (slot != nritems) {
		/* only internal nodes (level > 0) go into the mod log */
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		/* shift existing ptrs right to open up 'slot' */
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3404
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try balancing into siblings before allocating a new node */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split at the midpoint; the upper half moves to the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* log the moves before copying the upper half into 'split' */
	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after 'c' */
	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* point the path at whichever half now contains our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3495
3496 /*
3497  * how many bytes are required to store the items in a leaf.  start
3498  * and nr indicate which items in the leaf to check.  This totals up the
3499  * space used both by the item structs and the item data
3500  */
3501 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3502 {
3503         struct btrfs_item *start_item;
3504         struct btrfs_item *end_item;
3505         struct btrfs_map_token token;
3506         int data_len;
3507         int nritems = btrfs_header_nritems(l);
3508         int end = min(nritems, start + nr) - 1;
3509
3510         if (!nr)
3511                 return 0;
3512         btrfs_init_map_token(&token);
3513         start_item = btrfs_item_nr(start);
3514         end_item = btrfs_item_nr(end);
3515         data_len = btrfs_token_item_offset(l, start_item, &token) +
3516                 btrfs_token_item_size(l, start_item, &token);
3517         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3518         data_len += sizeof(struct btrfs_item) * nr;
3519         WARN_ON(data_len < 0);
3520         return data_len;
3521 }
3522
3523 /*
3524  * The space between the end of the leaf items and
3525  * the start of the leaf data.  IOW, how much room
3526  * the leaf has left for both items and data
3527  */
3528 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3529                                    struct extent_buffer *leaf)
3530 {
3531         int nritems = btrfs_header_nritems(leaf);
3532         int ret;
3533
3534         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3535         if (ret < 0) {
3536                 btrfs_crit(fs_info,
3537                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3538                            ret,
3539                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3540                            leaf_space_used(leaf, 0, nritems), nritems);
3541         }
3542         return ret;
3543 }
3544
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Returns 0 when items were pushed (path may now point into 'right'),
 * 1 when nothing could be pushed ('right' is unlocked and released).
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if we're inserting past the last slot, reserve room for the item */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* scan right-to-left, counting how many items fit in 'right' */
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* don't push the slot we're trying to insert at */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* bytes of item data being moved out of 'left' */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	/* slide right's item headers over to make room at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3686
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means the leaf is the root; nothing to push into */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* no right sibling under this parent */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW can shrink free space; re-check after it */
	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaft. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right consumes our lock/ref on 'right' */
	return __push_leaf_right(fs_info, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3770
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * Returns 0 when items were pushed (path may now point into 'left'),
 * 1 when nothing could be pushed ('left' is unlocked and released).
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* scan left-to-right, counting how many items fit in 'left' */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* don't push past the slot we're inserting at */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* bytes of item data being moved out of 'right' */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the offsets of the items we just appended to 'left' */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* compact right's remaining data against the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* recompute item offsets for everything left in 'right' */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	/* right's first key changed; propagate it up the tree */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3922
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* slot in the parent that points at our leaf ('right' here) */
	slot = path->slots[1];
	if (slot == 0)
		return 1;	/* no left sibling under this parent */
	if (!path->nodes[1])
		return 1;	/* leaf is the tree root, nothing to push to */

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;	/* nothing to push */

	/* the parent must already be locked by the caller's search */
	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* quick check before paying for the COW below */
	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the leaf, so re-check the free space */
	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* on success, __push_leaf_left owns the lock/ref on 'left' */
	return __push_leaf_left(fs_info, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3996
/*
 * copy the items from [mid, nritems) of leaf 'l' into the freshly
 * allocated (empty) leaf 'right' and insert a pointer to 'right' into
 * the parent node.  This is the item-moving half of a leaf split; the
 * split point 'mid' was chosen by the caller (split_leaf).
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* number of items that move into the new right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* copy the item headers (fixed-size array at the leaf front) */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data (grows from the end of the leaf) */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/* data offsets were relative to 'l'; rebase them for 'right' */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* shrink the left leaf and link 'right' into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now holds our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4062
4063 /*
4064  * double splits happen when we need to insert a big item in the middle
4065  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4066  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4067  *          A                 B                 C
4068  *
4069  * We avoid this by trying to push the items on either side of our target
4070  * into the adjacent leaves.  If all goes well we can avoid the double split
4071  * completely.
4072  */
4073 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4074                                           struct btrfs_root *root,
4075                                           struct btrfs_path *path,
4076                                           int data_size)
4077 {
4078         struct btrfs_fs_info *fs_info = root->fs_info;
4079         int ret;
4080         int progress = 0;
4081         int slot;
4082         u32 nritems;
4083         int space_needed = data_size;
4084
4085         slot = path->slots[0];
4086         if (slot < btrfs_header_nritems(path->nodes[0]))
4087                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4088
4089         /*
4090          * try to push all the items after our slot into the
4091          * right leaf
4092          */
4093         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4094         if (ret < 0)
4095                 return ret;
4096
4097         if (ret == 0)
4098                 progress++;
4099
4100         nritems = btrfs_header_nritems(path->nodes[0]);
4101         /*
4102          * our goal is to get our slot at the start or end of a leaf.  If
4103          * we've done so we're done
4104          */
4105         if (path->slots[0] == 0 || path->slots[0] == nritems)
4106                 return 0;
4107
4108         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4109                 return 0;
4110
4111         /* try to push all the items before our slot into the next leaf */
4112         slot = path->slots[0];
4113         space_needed = data_size;
4114         if (slot > 0)
4115                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4116         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4117         if (ret < 0)
4118                 return ret;
4119
4120         if (ret == 0)
4121                 progress++;
4122
4123         if (progress)
4124                 return 0;
4125         return 1;
4126 }
4127
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	/*
	 * split == 1: normal split, move items [mid, nritems) to 'right'
	 * split == 0: no item moves, just hand back a brand new empty leaf
	 * split == 2: double split needed, go around 'again' a second time
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item that can never fit alone in any leaf is fatal */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(fs_info, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			/* right push failed, recompute and try the left */
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(fs_info,
								      l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		/* the pushes may have moved the path to a different leaf */
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* pick a split point that leaves room for data_size near 'slot' */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* inserting past the end: new empty leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					/* try pushing before double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				/* inserting at slot 0: new empty leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					/* try pushing before double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* first key of the new leaf: ins_key if empty, item at mid if not */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* no items move; just link the empty leaf and repoint path */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* a double split is only ever needed once */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4294
/*
 * Prepare the leaf at path->slots[0] so that ins_len more bytes fit in
 * it, releasing and re-searching the path and splitting the leaf if
 * necessary.  Only EXTENT_DATA and EXTENT_CSUM items are supported.
 *
 * Returns 0 on success, -EAGAIN when the item changed under us after
 * the path was released, or another negative errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	/* already enough room, nothing to do */
	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/* remember enough about the item to detect changes after release */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;	/* item disappeared while unlocked */
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	/* extend == 1 because the caller will grow the existing item */
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4362
/*
 * Split the item at path->slots[0] in place at byte offset split_offset
 * from the start of its data.  The original item keeps the first
 * split_offset bytes; a new item with 'new_key' is inserted right after
 * it holding the remainder.  The leaf must already have room for one
 * extra struct btrfs_item (see setup_leaf_for_split).
 *
 * Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stage the whole item's data so it can be rewritten as two items */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes immediately after the one being split */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* new item: tail of the data, placed at the original offset */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* old item: head of the data, shifted past the new item's data */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4432
4433 /*
4434  * This function splits a single item into two items,
4435  * giving 'new_key' to the new item and splitting the
4436  * old one at split_offset (from the start of the item).
4437  *
4438  * The path may be released by this operation.  After
4439  * the split, the path is pointing to the old item.  The
4440  * new item is going to be in the same node as the old one.
4441  *
4442  * Note, the item being split must be smaller enough to live alone on
4443  * a tree block with room for one extra struct btrfs_item
4444  *
4445  * This allows us to split the item in place, keeping a lock on the
4446  * leaf the entire time.
4447  */
4448 int btrfs_split_item(struct btrfs_trans_handle *trans,
4449                      struct btrfs_root *root,
4450                      struct btrfs_path *path,
4451                      const struct btrfs_key *new_key,
4452                      unsigned long split_offset)
4453 {
4454         int ret;
4455         ret = setup_leaf_for_split(trans, root, path,
4456                                    sizeof(struct btrfs_item));
4457         if (ret)
4458                 return ret;
4459
4460         ret = split_item(root->fs_info, path, new_key, split_offset);
4461         return ret;
4462 }
4463
4464 /*
4465  * This function duplicate a item, giving 'new_key' to the new item.
4466  * It guarantees both items live in the same tree leaf and the new item
4467  * is contiguous with the original item.
4468  *
4469  * This allows us to split file extent in place, keeping a lock on the
4470  * leaf the entire time.
4471  */
4472 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4473                          struct btrfs_root *root,
4474                          struct btrfs_path *path,
4475                          const struct btrfs_key *new_key)
4476 {
4477         struct extent_buffer *leaf;
4478         int ret;
4479         u32 item_size;
4480
4481         leaf = path->nodes[0];
4482         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4483         ret = setup_leaf_for_split(trans, root, path,
4484                                    item_size + sizeof(struct btrfs_item));
4485         if (ret)
4486                 return ret;
4487
4488         path->slots[0]++;
4489         setup_items_for_insert(root, path, new_key, &item_size,
4490                                item_size, item_size +
4491                                sizeof(struct btrfs_item), 1);
4492         leaf = path->nodes[0];
4493         memcpy_extent_buffer(leaf,
4494                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4495                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4496                              item_size);
4497         return 0;
4498 }
4499
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;	/* already the requested size */

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		/* data moves toward the end of the leaf by size_diff */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chop from the end: keep the head of our item's data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/* fi points at where the extent item will land */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/*
				 * inline extents keep their header at the
				 * item start; move it before the data shift
				 */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* chop from the front: keep the tail of our item's data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* the key offset advances by the bytes chopped off the front */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4601
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 * The caller must have verified (or guaranteed via setup_leaf_for_split)
 * that the leaf has at least data_size bytes free.
 */
void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* caller bug if there is not enough room; leaf dump helps debugging */
	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		/* data moves toward the front of the leaf by data_size */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4669
4670 /*
4671  * this is a helper for btrfs_insert_empty_items, the main goal here is
4672  * to save stack depth by doing the bulk of the work in a function
4673  * that doesn't call btrfs_search_slot
4674  */
4675 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4676                             const struct btrfs_key *cpu_key, u32 *data_size,
4677                             u32 total_data, u32 total_size, int nr)
4678 {
4679         struct btrfs_fs_info *fs_info = root->fs_info;
4680         struct btrfs_item *item;
4681         int i;
4682         u32 nritems;
4683         unsigned int data_end;
4684         struct btrfs_disk_key disk_key;
4685         struct extent_buffer *leaf;
4686         int slot;
4687         struct btrfs_map_token token;
4688
4689         if (path->slots[0] == 0) {
4690                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4691                 fixup_low_keys(path, &disk_key, 1);
4692         }
4693         btrfs_unlock_up_safe(path, 1);
4694
4695         btrfs_init_map_token(&token);
4696
4697         leaf = path->nodes[0];
4698         slot = path->slots[0];
4699
4700         nritems = btrfs_header_nritems(leaf);
4701         data_end = leaf_data_end(fs_info, leaf);
4702
4703         if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4704                 btrfs_print_leaf(leaf);
4705                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4706                            total_size, btrfs_leaf_free_space(fs_info, leaf));
4707                 BUG();
4708         }
4709
4710         if (slot != nritems) {
4711                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4712
4713                 if (old_data < data_end) {
4714                         btrfs_print_leaf(leaf);
4715                         btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4716                                    slot, old_data, data_end);
4717                         BUG_ON(1);
4718                 }
4719                 /*
4720                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4721                  */
4722                 /* first correct the data pointers */
4723                 for (i = slot; i < nritems; i++) {
4724                         u32 ioff;
4725
4726                         item = btrfs_item_nr(i);
4727                         ioff = btrfs_token_item_offset(leaf, item, &token);
4728                         btrfs_set_token_item_offset(leaf, item,
4729                                                     ioff - total_data, &token);
4730                 }
4731                 /* shift the items */
4732                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4733                               btrfs_item_nr_offset(slot),
4734                               (nritems - slot) * sizeof(struct btrfs_item));
4735
4736                 /* shift the data */
4737                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4738                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4739                               data_end, old_data - data_end);
4740                 data_end = old_data;
4741         }
4742
4743         /* setup the item for the new data */
4744         for (i = 0; i < nr; i++) {
4745                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4746                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4747                 item = btrfs_item_nr(slot + i);
4748                 btrfs_set_token_item_offset(leaf, item,
4749                                             data_end - data_size[i], &token);
4750                 data_end -= data_size[i];
4751                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4752         }
4753
4754         btrfs_set_header_nritems(leaf, nritems + nr);
4755         btrfs_mark_buffer_dirty(leaf);
4756
4757         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4758                 btrfs_print_leaf(leaf);
4759                 BUG();
4760         }
4761 }
4762
4763 /*
4764  * Given a key and some data, insert items into the tree.
4765  * This does all the path init required, making room in the tree if needed.
4766  */
4767 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4768                             struct btrfs_root *root,
4769                             struct btrfs_path *path,
4770                             const struct btrfs_key *cpu_key, u32 *data_size,
4771                             int nr)
4772 {
4773         int ret = 0;
4774         int slot;
4775         int i;
4776         u32 total_size = 0;
4777         u32 total_data = 0;
4778
4779         for (i = 0; i < nr; i++)
4780                 total_data += data_size[i];
4781
4782         total_size = total_data + (nr * sizeof(struct btrfs_item));
4783         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4784         if (ret == 0)
4785                 return -EEXIST;
4786         if (ret < 0)
4787                 return ret;
4788
4789         slot = path->slots[0];
4790         BUG_ON(slot < 0);
4791
4792         setup_items_for_insert(root, path, cpu_key, data_size,
4793                                total_data, total_size, nr);
4794         return 0;
4795 }
4796
4797 /*
4798  * Given a key and some data, insert an item into the tree.
4799  * This does all the path init required, making room in the tree if needed.
4800  */
4801 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4802                       const struct btrfs_key *cpu_key, void *data,
4803                       u32 data_size)
4804 {
4805         int ret = 0;
4806         struct btrfs_path *path;
4807         struct extent_buffer *leaf;
4808         unsigned long ptr;
4809
4810         path = btrfs_alloc_path();
4811         if (!path)
4812                 return -ENOMEM;
4813         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4814         if (!ret) {
4815                 leaf = path->nodes[0];
4816                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4817                 write_extent_buffer(leaf, data, ptr, data_size);
4818                 btrfs_mark_buffer_dirty(leaf);
4819         }
4820         btrfs_free_path(path);
4821         return ret;
4822 }
4823
4824 /*
4825  * delete the pointer from a given node.
4826  *
4827  * the tree should have been previously balanced so the deletion does not
4828  * empty a node.
4829  */
4830 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4831                     int level, int slot)
4832 {
4833         struct extent_buffer *parent = path->nodes[level];
4834         u32 nritems;
4835         int ret;
4836
4837         nritems = btrfs_header_nritems(parent);
4838         if (slot != nritems - 1) {
4839                 if (level) {
4840                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4841                                         nritems - slot - 1);
4842                         BUG_ON(ret < 0);
4843                 }
4844                 memmove_extent_buffer(parent,
4845                               btrfs_node_key_ptr_offset(slot),
4846                               btrfs_node_key_ptr_offset(slot + 1),
4847                               sizeof(struct btrfs_key_ptr) *
4848                               (nritems - slot - 1));
4849         } else if (level) {
4850                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4851                                 GFP_NOFS);
4852                 BUG_ON(ret < 0);
4853         }
4854
4855         nritems--;
4856         btrfs_set_header_nritems(parent, nritems);
4857         if (nritems == 0 && parent == root->node) {
4858                 BUG_ON(btrfs_header_level(root->node) != 1);
4859                 /* just turn the root into a leaf and break */
4860                 btrfs_set_header_level(root->node, 0);
4861         } else if (slot == 0) {
4862                 struct btrfs_disk_key disk_key;
4863
4864                 btrfs_node_key(parent, &disk_key, 0);
4865                 fixup_low_keys(path, &disk_key, level + 1);
4866         }
4867         btrfs_mark_buffer_dirty(parent);
4868 }
4869
4870 /*
4871  * a helper function to delete the leaf pointed to by path->slots[1] and
4872  * path->nodes[1].
4873  *
4874  * This deletes the pointer in path->nodes[1] and frees the leaf
4875  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4876  *
4877  * The path must have already been setup for deleting the leaf, including
4878  * all the proper balancing.  path->nodes[1] must be locked.
4879  */
4880 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4881                                     struct btrfs_root *root,
4882                                     struct btrfs_path *path,
4883                                     struct extent_buffer *leaf)
4884 {
4885         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4886         del_ptr(root, path, 1, path->slots[1]);
4887
4888         /*
4889          * btrfs_free_extent is expensive, we want to make sure we
4890          * aren't holding any locks when we call it
4891          */
4892         btrfs_unlock_up_safe(path, 0);
4893
4894         root_sub_used(root, leaf->len);
4895
4896         extent_buffer_get(leaf);
4897         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4898         free_extent_buffer_stale(leaf);
4899 }
4900 /*
4901  * delete the item at the leaf level in path.  If that empties
4902  * the leaf, remove it from the tree
4903  */
4904 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4905                     struct btrfs_path *path, int slot, int nr)
4906 {
4907         struct btrfs_fs_info *fs_info = root->fs_info;
4908         struct extent_buffer *leaf;
4909         struct btrfs_item *item;
4910         u32 last_off;
4911         u32 dsize = 0;
4912         int ret = 0;
4913         int wret;
4914         int i;
4915         u32 nritems;
4916         struct btrfs_map_token token;
4917
4918         btrfs_init_map_token(&token);
4919
4920         leaf = path->nodes[0];
4921         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4922
4923         for (i = 0; i < nr; i++)
4924                 dsize += btrfs_item_size_nr(leaf, slot + i);
4925
4926         nritems = btrfs_header_nritems(leaf);
4927
4928         if (slot + nr != nritems) {
4929                 int data_end = leaf_data_end(fs_info, leaf);
4930
4931                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4932                               data_end + dsize,
4933                               BTRFS_LEAF_DATA_OFFSET + data_end,
4934                               last_off - data_end);
4935
4936                 for (i = slot + nr; i < nritems; i++) {
4937                         u32 ioff;
4938
4939                         item = btrfs_item_nr(i);
4940                         ioff = btrfs_token_item_offset(leaf, item, &token);
4941                         btrfs_set_token_item_offset(leaf, item,
4942                                                     ioff + dsize, &token);
4943                 }
4944
4945                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4946                               btrfs_item_nr_offset(slot + nr),
4947                               sizeof(struct btrfs_item) *
4948                               (nritems - slot - nr));
4949         }
4950         btrfs_set_header_nritems(leaf, nritems - nr);
4951         nritems -= nr;
4952
4953         /* delete the leaf if we've emptied it */
4954         if (nritems == 0) {
4955                 if (leaf == root->node) {
4956                         btrfs_set_header_level(leaf, 0);
4957                 } else {
4958                         btrfs_set_path_blocking(path);
4959                         clean_tree_block(fs_info, leaf);
4960                         btrfs_del_leaf(trans, root, path, leaf);
4961                 }
4962         } else {
4963                 int used = leaf_space_used(leaf, 0, nritems);
4964                 if (slot == 0) {
4965                         struct btrfs_disk_key disk_key;
4966
4967                         btrfs_item_key(leaf, &disk_key, 0);
4968                         fixup_low_keys(path, &disk_key, 1);
4969                 }
4970
4971                 /* delete the leaf if it is mostly empty */
4972                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4973                         /* push_leaf_left fixes the path.
4974                          * make sure the path still points to our leaf
4975                          * for possible call to del_ptr below
4976                          */
4977                         slot = path->slots[1];
4978                         extent_buffer_get(leaf);
4979
4980                         btrfs_set_path_blocking(path);
4981                         wret = push_leaf_left(trans, root, path, 1, 1,
4982                                               1, (u32)-1);
4983                         if (wret < 0 && wret != -ENOSPC)
4984                                 ret = wret;
4985
4986                         if (path->nodes[0] == leaf &&
4987                             btrfs_header_nritems(leaf)) {
4988                                 wret = push_leaf_right(trans, root, path, 1,
4989                                                        1, 1, 0);
4990                                 if (wret < 0 && wret != -ENOSPC)
4991                                         ret = wret;
4992                         }
4993
4994                         if (btrfs_header_nritems(leaf) == 0) {
4995                                 path->slots[1] = slot;
4996                                 btrfs_del_leaf(trans, root, path, leaf);
4997                                 free_extent_buffer(leaf);
4998                                 ret = 0;
4999                         } else {
5000                                 /* if we're still in the path, make sure
5001                                  * we're dirty.  Otherwise, one of the
5002                                  * push_leaf functions must have already
5003                                  * dirtied this buffer
5004                                  */
5005                                 if (path->nodes[0] == leaf)
5006                                         btrfs_mark_buffer_dirty(leaf);
5007                                 free_extent_buffer(leaf);
5008                         }
5009                 } else {
5010                         btrfs_mark_buffer_dirty(leaf);
5011                 }
5012         }
5013         return ret;
5014 }
5015
5016 /*
5017  * search the tree again to find a leaf with lesser keys
5018  * returns 0 if it found something or 1 if there are no lesser leaves.
5019  * returns < 0 on io errors.
5020  *
5021  * This may release the path, and so you may lose any locks held at the
5022  * time you call it.
5023  */
5024 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5025 {
5026         struct btrfs_key key;
5027         struct btrfs_disk_key found_key;
5028         int ret;
5029
5030         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5031
5032         if (key.offset > 0) {
5033                 key.offset--;
5034         } else if (key.type > 0) {
5035                 key.type--;
5036                 key.offset = (u64)-1;
5037         } else if (key.objectid > 0) {
5038                 key.objectid--;
5039                 key.type = (u8)-1;
5040                 key.offset = (u64)-1;
5041         } else {
5042                 return 1;
5043         }
5044
5045         btrfs_release_path(path);
5046         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5047         if (ret < 0)
5048                 return ret;
5049         btrfs_item_key(path->nodes[0], &found_key, 0);
5050         ret = comp_keys(&found_key, &key);
5051         /*
5052          * We might have had an item with the previous key in the tree right
5053          * before we released our path. And after we released our path, that
5054          * item might have been pushed to the first slot (0) of the leaf we
5055          * were holding due to a tree balance. Alternatively, an item with the
5056          * previous key can exist as the only element of a leaf (big fat item).
5057          * Therefore account for these 2 cases, so that our callers (like
5058          * btrfs_previous_item) don't miss an existing item with a key matching
5059          * the previous key we computed above.
5060          */
5061         if (ret <= 0)
5062                 return 0;
5063         return 1;
5064 }
5065
5066 /*
5067  * A helper function to walk down the tree starting at min_key, and looking
5068  * for nodes or leaves that are have a minimum transaction id.
5069  * This is used by the btree defrag code, and tree logging
5070  *
5071  * This does not cow, but it does stuff the starting key it finds back
5072  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5073  * key and get a writable path.
5074  *
5075  * This honors path->lowest_level to prevent descent past a given level
5076  * of the tree.
5077  *
5078  * min_trans indicates the oldest transaction that you are interested
5079  * in walking through.  Any nodes or leaves older than min_trans are
5080  * skipped over (without reading them).
5081  *
5082  * returns zero if something useful was found, < 0 on error and 1 if there
5083  * was nothing in the tree that matched the search criteria.
5084  */
5085 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5086                          struct btrfs_path *path,
5087                          u64 min_trans)
5088 {
5089         struct btrfs_fs_info *fs_info = root->fs_info;
5090         struct extent_buffer *cur;
5091         struct btrfs_key found_key;
5092         int slot;
5093         int sret;
5094         u32 nritems;
5095         int level;
5096         int ret = 1;
5097         int keep_locks = path->keep_locks;
5098
5099         path->keep_locks = 1;
5100 again:
5101         cur = btrfs_read_lock_root_node(root);
5102         level = btrfs_header_level(cur);
5103         WARN_ON(path->nodes[level]);
5104         path->nodes[level] = cur;
5105         path->locks[level] = BTRFS_READ_LOCK;
5106
5107         if (btrfs_header_generation(cur) < min_trans) {
5108                 ret = 1;
5109                 goto out;
5110         }
5111         while (1) {
5112                 nritems = btrfs_header_nritems(cur);
5113                 level = btrfs_header_level(cur);
5114                 sret = btrfs_bin_search(cur, min_key, level, &slot);
5115
5116                 /* at the lowest level, we're done, setup the path and exit */
5117                 if (level == path->lowest_level) {
5118                         if (slot >= nritems)
5119                                 goto find_next_key;
5120                         ret = 0;
5121                         path->slots[level] = slot;
5122                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5123                         goto out;
5124                 }
5125                 if (sret && slot > 0)
5126                         slot--;
5127                 /*
5128                  * check this node pointer against the min_trans parameters.
5129                  * If it is too old, old, skip to the next one.
5130                  */
5131                 while (slot < nritems) {
5132                         u64 gen;
5133
5134                         gen = btrfs_node_ptr_generation(cur, slot);
5135                         if (gen < min_trans) {
5136                                 slot++;
5137                                 continue;
5138                         }
5139                         break;
5140                 }
5141 find_next_key:
5142                 /*
5143                  * we didn't find a candidate key in this node, walk forward
5144                  * and find another one
5145                  */
5146                 if (slot >= nritems) {
5147                         path->slots[level] = slot;
5148                         btrfs_set_path_blocking(path);
5149                         sret = btrfs_find_next_key(root, path, min_key, level,
5150                                                   min_trans);
5151                         if (sret == 0) {
5152                                 btrfs_release_path(path);
5153                                 goto again;
5154                         } else {
5155                                 goto out;
5156                         }
5157                 }
5158                 /* save our key for returning back */
5159                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5160                 path->slots[level] = slot;
5161                 if (level == path->lowest_level) {
5162                         ret = 0;
5163                         goto out;
5164                 }
5165                 btrfs_set_path_blocking(path);
5166                 cur = read_node_slot(fs_info, cur, slot);
5167                 if (IS_ERR(cur)) {
5168                         ret = PTR_ERR(cur);
5169                         goto out;
5170                 }
5171
5172                 btrfs_tree_read_lock(cur);
5173
5174                 path->locks[level - 1] = BTRFS_READ_LOCK;
5175                 path->nodes[level - 1] = cur;
5176                 unlock_up(path, level, 1, 0, NULL);
5177         }
5178 out:
5179         path->keep_locks = keep_locks;
5180         if (ret == 0) {
5181                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5182                 btrfs_set_path_blocking(path);
5183                 memcpy(min_key, &found_key, sizeof(found_key));
5184         }
5185         return ret;
5186 }
5187
5188 static int tree_move_down(struct btrfs_fs_info *fs_info,
5189                            struct btrfs_path *path,
5190                            int *level)
5191 {
5192         struct extent_buffer *eb;
5193
5194         BUG_ON(*level == 0);
5195         eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5196         if (IS_ERR(eb))
5197                 return PTR_ERR(eb);
5198
5199         path->nodes[*level - 1] = eb;
5200         path->slots[*level - 1] = 0;
5201         (*level)--;
5202         return 0;
5203 }
5204
5205 static int tree_move_next_or_upnext(struct btrfs_path *path,
5206                                     int *level, int root_level)
5207 {
5208         int ret = 0;
5209         int nritems;
5210         nritems = btrfs_header_nritems(path->nodes[*level]);
5211
5212         path->slots[*level]++;
5213
5214         while (path->slots[*level] >= nritems) {
5215                 if (*level == root_level)
5216                         return -1;
5217
5218                 /* move upnext */
5219                 path->slots[*level] = 0;
5220                 free_extent_buffer(path->nodes[*level]);
5221                 path->nodes[*level] = NULL;
5222                 (*level)++;
5223                 path->slots[*level]++;
5224
5225                 nritems = btrfs_header_nritems(path->nodes[*level]);
5226                 ret = 1;
5227         }
5228         return ret;
5229 }
5230
5231 /*
5232  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5233  * or down.
5234  */
5235 static int tree_advance(struct btrfs_fs_info *fs_info,
5236                         struct btrfs_path *path,
5237                         int *level, int root_level,
5238                         int allow_down,
5239                         struct btrfs_key *key)
5240 {
5241         int ret;
5242
5243         if (*level == 0 || !allow_down) {
5244                 ret = tree_move_next_or_upnext(path, level, root_level);
5245         } else {
5246                 ret = tree_move_down(fs_info, path, level);
5247         }
5248         if (ret >= 0) {
5249                 if (*level == 0)
5250                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5251                                         path->slots[*level]);
5252                 else
5253                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5254                                         path->slots[*level]);
5255         }
5256         return ret;
5257 }
5258
5259 static int tree_compare_item(struct btrfs_path *left_path,
5260                              struct btrfs_path *right_path,
5261                              char *tmp_buf)
5262 {
5263         int cmp;
5264         int len1, len2;
5265         unsigned long off1, off2;
5266
5267         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5268         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5269         if (len1 != len2)
5270                 return 1;
5271
5272         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5273         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5274                                 right_path->slots[0]);
5275
5276         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5277
5278         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5279         if (cmp)
5280                 return 1;
5281         return 0;
5282 }
5283
5284 #define ADVANCE 1
5285 #define ADVANCE_ONLY_NEXT -1
5286
5287 /*
5288  * This function compares two trees and calls the provided callback for
5289  * every changed/new/deleted item it finds.
5290  * If shared tree blocks are encountered, whole subtrees are skipped, making
5291  * the compare pretty fast on snapshotted subvolumes.
5292  *
5293  * This currently works on commit roots only. As commit roots are read only,
5294  * we don't do any locking. The commit roots are protected with transactions.
5295  * Transactions are ended and rejoined when a commit is tried in between.
5296  *
5297  * This function checks for modifications done to the trees while comparing.
5298  * If it detects a change, it aborts immediately.
5299  */
5300 int btrfs_compare_trees(struct btrfs_root *left_root,
5301                         struct btrfs_root *right_root,
5302                         btrfs_changed_cb_t changed_cb, void *ctx)
5303 {
5304         struct btrfs_fs_info *fs_info = left_root->fs_info;
5305         int ret;
5306         int cmp;
5307         struct btrfs_path *left_path = NULL;
5308         struct btrfs_path *right_path = NULL;
5309         struct btrfs_key left_key;
5310         struct btrfs_key right_key;
5311         char *tmp_buf = NULL;
5312         int left_root_level;
5313         int right_root_level;
5314         int left_level;
5315         int right_level;
5316         int left_end_reached;
5317         int right_end_reached;
5318         int advance_left;
5319         int advance_right;
5320         u64 left_blockptr;
5321         u64 right_blockptr;
5322         u64 left_gen;
5323         u64 right_gen;
5324
5325         left_path = btrfs_alloc_path();
5326         if (!left_path) {
5327                 ret = -ENOMEM;
5328                 goto out;
5329         }
5330         right_path = btrfs_alloc_path();
5331         if (!right_path) {
5332                 ret = -ENOMEM;
5333                 goto out;
5334         }
5335
5336         tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5337         if (!tmp_buf) {
5338                 ret = -ENOMEM;
5339                 goto out;
5340         }
5341
5342         left_path->search_commit_root = 1;
5343         left_path->skip_locking = 1;
5344         right_path->search_commit_root = 1;
5345         right_path->skip_locking = 1;
5346
5347         /*
5348          * Strategy: Go to the first items of both trees. Then do
5349          *
5350          * If both trees are at level 0
5351          *   Compare keys of current items
5352          *     If left < right treat left item as new, advance left tree
5353          *       and repeat
5354          *     If left > right treat right item as deleted, advance right tree
5355          *       and repeat
5356          *     If left == right do deep compare of items, treat as changed if
5357          *       needed, advance both trees and repeat
5358          * If both trees are at the same level but not at level 0
5359          *   Compare keys of current nodes/leafs
5360          *     If left < right advance left tree and repeat
5361          *     If left > right advance right tree and repeat
5362          *     If left == right compare blockptrs of the next nodes/leafs
5363          *       If they match advance both trees but stay at the same level
5364          *         and repeat
5365          *       If they don't match advance both trees while allowing to go
5366          *         deeper and repeat
5367          * If tree levels are different
5368          *   Advance the tree that needs it and repeat
5369          *
5370          * Advancing a tree means:
5371          *   If we are at level 0, try to go to the next slot. If that's not
5372          *   possible, go one level up and repeat. Stop when we found a level
5373          *   where we could go to the next slot. We may at this point be on a
5374          *   node or a leaf.
5375          *
5376          *   If we are not at level 0 and not on shared tree blocks, go one
5377          *   level deeper.
5378          *
5379          *   If we are not at level 0 and on shared tree blocks, go one slot to
5380          *   the right if possible or go up and right.
5381          */
5382
5383         down_read(&fs_info->commit_root_sem);
5384         left_level = btrfs_header_level(left_root->commit_root);
5385         left_root_level = left_level;
5386         left_path->nodes[left_level] =
5387                         btrfs_clone_extent_buffer(left_root->commit_root);
5388         if (!left_path->nodes[left_level]) {
5389                 up_read(&fs_info->commit_root_sem);
5390                 ret = -ENOMEM;
5391                 goto out;
5392         }
5393         extent_buffer_get(left_path->nodes[left_level]);
5394
5395         right_level = btrfs_header_level(right_root->commit_root);
5396         right_root_level = right_level;
5397         right_path->nodes[right_level] =
5398                         btrfs_clone_extent_buffer(right_root->commit_root);
5399         if (!right_path->nodes[right_level]) {
5400                 up_read(&fs_info->commit_root_sem);
5401                 ret = -ENOMEM;
5402                 goto out;
5403         }
5404         extent_buffer_get(right_path->nodes[right_level]);
5405         up_read(&fs_info->commit_root_sem);
5406
5407         if (left_level == 0)
5408                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5409                                 &left_key, left_path->slots[left_level]);
5410         else
5411                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5412                                 &left_key, left_path->slots[left_level]);
5413         if (right_level == 0)
5414                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5415                                 &right_key, right_path->slots[right_level]);
5416         else
5417                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5418                                 &right_key, right_path->slots[right_level]);
5419
5420         left_end_reached = right_end_reached = 0;
5421         advance_left = advance_right = 0;
5422
5423         while (1) {
5424                 if (advance_left && !left_end_reached) {
5425                         ret = tree_advance(fs_info, left_path, &left_level,
5426                                         left_root_level,
5427                                         advance_left != ADVANCE_ONLY_NEXT,
5428                                         &left_key);
5429                         if (ret == -1)
5430                                 left_end_reached = ADVANCE;
5431                         else if (ret < 0)
5432                                 goto out;
5433                         advance_left = 0;
5434                 }
5435                 if (advance_right && !right_end_reached) {
5436                         ret = tree_advance(fs_info, right_path, &right_level,
5437                                         right_root_level,
5438                                         advance_right != ADVANCE_ONLY_NEXT,
5439                                         &right_key);
5440                         if (ret == -1)
5441                                 right_end_reached = ADVANCE;
5442                         else if (ret < 0)
5443                                 goto out;
5444                         advance_right = 0;
5445                 }
5446
5447                 if (left_end_reached && right_end_reached) {
5448                         ret = 0;
5449                         goto out;
5450                 } else if (left_end_reached) {
5451                         if (right_level == 0) {
5452                                 ret = changed_cb(left_path, right_path,
5453                                                 &right_key,
5454                                                 BTRFS_COMPARE_TREE_DELETED,
5455                                                 ctx);
5456                                 if (ret < 0)
5457                                         goto out;
5458                         }
5459                         advance_right = ADVANCE;
5460                         continue;
5461                 } else if (right_end_reached) {
5462                         if (left_level == 0) {
5463                                 ret = changed_cb(left_path, right_path,
5464                                                 &left_key,
5465                                                 BTRFS_COMPARE_TREE_NEW,
5466                                                 ctx);
5467                                 if (ret < 0)
5468                                         goto out;
5469                         }
5470                         advance_left = ADVANCE;
5471                         continue;
5472                 }
5473
5474                 if (left_level == 0 && right_level == 0) {
5475                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5476                         if (cmp < 0) {
5477                                 ret = changed_cb(left_path, right_path,
5478                                                 &left_key,
5479                                                 BTRFS_COMPARE_TREE_NEW,
5480                                                 ctx);
5481                                 if (ret < 0)
5482                                         goto out;
5483                                 advance_left = ADVANCE;
5484                         } else if (cmp > 0) {
5485                                 ret = changed_cb(left_path, right_path,
5486                                                 &right_key,
5487                                                 BTRFS_COMPARE_TREE_DELETED,
5488                                                 ctx);
5489                                 if (ret < 0)
5490                                         goto out;
5491                                 advance_right = ADVANCE;
5492                         } else {
5493                                 enum btrfs_compare_tree_result result;
5494
5495                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5496                                 ret = tree_compare_item(left_path, right_path,
5497                                                         tmp_buf);
5498                                 if (ret)
5499                                         result = BTRFS_COMPARE_TREE_CHANGED;
5500                                 else
5501                                         result = BTRFS_COMPARE_TREE_SAME;
5502                                 ret = changed_cb(left_path, right_path,
5503                                                  &left_key, result, ctx);
5504                                 if (ret < 0)
5505                                         goto out;
5506                                 advance_left = ADVANCE;
5507                                 advance_right = ADVANCE;
5508                         }
5509                 } else if (left_level == right_level) {
5510                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5511                         if (cmp < 0) {
5512                                 advance_left = ADVANCE;
5513                         } else if (cmp > 0) {
5514                                 advance_right = ADVANCE;
5515                         } else {
5516                                 left_blockptr = btrfs_node_blockptr(
5517                                                 left_path->nodes[left_level],
5518                                                 left_path->slots[left_level]);
5519                                 right_blockptr = btrfs_node_blockptr(
5520                                                 right_path->nodes[right_level],
5521                                                 right_path->slots[right_level]);
5522                                 left_gen = btrfs_node_ptr_generation(
5523                                                 left_path->nodes[left_level],
5524                                                 left_path->slots[left_level]);
5525                                 right_gen = btrfs_node_ptr_generation(
5526                                                 right_path->nodes[right_level],
5527                                                 right_path->slots[right_level]);
5528                                 if (left_blockptr == right_blockptr &&
5529                                     left_gen == right_gen) {
5530                                         /*
5531                                          * As we're on a shared block, don't
5532                                          * allow to go deeper.
5533                                          */
5534                                         advance_left = ADVANCE_ONLY_NEXT;
5535                                         advance_right = ADVANCE_ONLY_NEXT;
5536                                 } else {
5537                                         advance_left = ADVANCE;
5538                                         advance_right = ADVANCE;
5539                                 }
5540                         }
5541                 } else if (left_level < right_level) {
5542                         advance_right = ADVANCE;
5543                 } else {
5544                         advance_left = ADVANCE;
5545                 }
5546         }
5547
5548 out:
5549         btrfs_free_path(left_path);
5550         btrfs_free_path(right_path);
5551         kvfree(tmp_buf);
5552         return ret;
5553 }
5554
5555 /*
5556  * this is similar to btrfs_next_leaf, but does not try to preserve
5557  * and fixup the path.  It looks for and returns the next key in the
5558  * tree based on the current path and the min_trans parameters.
5559  *
5560  * 0 is returned if another key is found, < 0 if there are any errors
5561  * and 1 is returned if there are no higher keys in the tree
5562  *
5563  * path->keep_locks should be set to 1 on the search made before
5564  * calling this function.
5565  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* The search that built @path must have been done with keep_locks. */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Try the slot right after the current one at this level. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				/* Parent is locked, safe to just walk up. */
				level++;
				continue;
			}

			/*
			 * Parent is unlocked, so we can't trust it.  Remember
			 * the last key of this node, drop the whole path and
			 * re-search down to this level to get fresh, locked
			 * nodes before retrying.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match found again: the next key is one slot on. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip child blocks not modified since @min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5630
5631 /*
5632  * search the tree again to find a leaf with greater keys
5633  * returns 0 if it found something or 1 if there are no greater leaves.
5634  * returns < 0 on io errors.
5635  */
5636 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5637 {
5638         return btrfs_next_old_leaf(root, path, 0);
5639 }
5640
/*
 * Walk forward to the leaf following the one @path currently points at,
 * optionally as the tree looked at @time_seq (0 means the current tree).
 *
 * Returns 0 with @path pointing at the first slot of the next leaf,
 * 1 if there is no next leaf, and < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf so we can re-search. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Walk up until we find a node with an unvisited slot to our right. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				/* Ran off the root: no next leaf exists. */
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the block we picked on a previous iteration, if any. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* Try-lock failed: take the blocking lock. */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* Now descend along the leftmost edge down to the leaf. */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* Drop every lock above the leaf and restore spinning behavior. */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5809
5810 /*
5811  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5812  * searching until it gets past min_objectid or finds an item of 'type'
5813  *
5814  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5815  */
5816 int btrfs_previous_item(struct btrfs_root *root,
5817                         struct btrfs_path *path, u64 min_objectid,
5818                         int type)
5819 {
5820         struct btrfs_key found_key;
5821         struct extent_buffer *leaf;
5822         u32 nritems;
5823         int ret;
5824
5825         while (1) {
5826                 if (path->slots[0] == 0) {
5827                         btrfs_set_path_blocking(path);
5828                         ret = btrfs_prev_leaf(root, path);
5829                         if (ret != 0)
5830                                 return ret;
5831                 } else {
5832                         path->slots[0]--;
5833                 }
5834                 leaf = path->nodes[0];
5835                 nritems = btrfs_header_nritems(leaf);
5836                 if (nritems == 0)
5837                         return 1;
5838                 if (path->slots[0] == nritems)
5839                         path->slots[0]--;
5840
5841                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5842                 if (found_key.objectid < min_objectid)
5843                         break;
5844                 if (found_key.type == type)
5845                         return 0;
5846                 if (found_key.objectid == min_objectid &&
5847                     found_key.type < type)
5848                         break;
5849         }
5850         return 1;
5851 }
5852
5853 /*
5854  * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
5856  *
5857  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5858  */
5859 int btrfs_previous_extent_item(struct btrfs_root *root,
5860                         struct btrfs_path *path, u64 min_objectid)
5861 {
5862         struct btrfs_key found_key;
5863         struct extent_buffer *leaf;
5864         u32 nritems;
5865         int ret;
5866
5867         while (1) {
5868                 if (path->slots[0] == 0) {
5869                         btrfs_set_path_blocking(path);
5870                         ret = btrfs_prev_leaf(root, path);
5871                         if (ret != 0)
5872                                 return ret;
5873                 } else {
5874                         path->slots[0]--;
5875                 }
5876                 leaf = path->nodes[0];
5877                 nritems = btrfs_header_nritems(leaf);
5878                 if (nritems == 0)
5879                         return 1;
5880                 if (path->slots[0] == nritems)
5881                         path->slots[0]--;
5882
5883                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5884                 if (found_key.objectid < min_objectid)
5885                         break;
5886                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5887                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5888                         return 0;
5889                 if (found_key.objectid == min_objectid &&
5890                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5891                         break;
5892         }
5893         return 1;
5894 }