// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *ins_key, struct btrfs_path *path,
                      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
        int i;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                if (!p->nodes[i] || !p->locks[i])
                        continue;
                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
                if (p->locks[i] == BTRFS_READ_LOCK)
                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
                else if (p->locks[i] == BTRFS_WRITE_LOCK)
                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
        }
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
        if (!p)
                return;
        btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
}
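
/*
 * Illustrative usage of the path helpers above (a sketch, not code from
 * this file): callers allocate a path, use it for one or more searches,
 * and free it when done, which also drops any locks and references the
 * path still holds:
 *
 *	struct btrfs_path *path;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *	...
 *	btrfs_free_path(path);
 */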

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
        int i;

        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                p->slots[i] = 0;
                if (!p->nodes[i])
                        continue;
                if (p->locks[i]) {
                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
                        p->locks[i] = 0;
                }
                free_extent_buffer(p->nodes[i]);
                p->nodes[i] = NULL;
        }
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node because
                 * it was COWed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A read
 * locked buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_read_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}
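
/*
 * Illustrative caller pattern for the helpers above (a sketch): the
 * returned buffer carries both a lock and a reference, so callers must
 * release both when done:
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_lock_root_node(root);
 *	... operate on the root node ...
 *	btrfs_tree_unlock(eb);
 *	free_extent_buffer(eb);
 */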

/* cowonly roots (everything not a reference counted cow subvolume) just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
                return;

        spin_lock(&fs_info->trans_lock);
        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
                /* Want the extent tree to be the last on the list */
                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
                        list_move_tail(&root->dirty_list,
                                       &fs_info->dirty_cowonly_roots);
                else
                        list_move(&root->dirty_list,
                                  &fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error
 * code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cow;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;

        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
                        &disk_key, level, buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, new_root_objectid);

        write_extent_buffer_fsid(cow, fs_info->fsid);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                ret = btrfs_inc_ref(trans, root, cow, 1);
        else
                ret = btrfs_inc_ref(trans, root, cow, 0);

        if (ret)
                return ret;

        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 logical;
        u64 seq;
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}
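
/*
 * Illustrative pairing of the get/put helpers (a sketch): a reader that
 * needs a stable view of past tree states registers a blocker, uses the
 * returned sequence number with the time_seq based search helpers, and
 * drops the blocker when done so old log entries can be pruned again:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk old tree state using elem.seq as time_seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */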

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct rb_node *next;
        struct seq_list *cur_elem;
        struct tree_mod_elem *tm;
        u64 min_seq = (u64)-1;
        u64 seq_putting = elem->seq;

        if (!seq_putting)
                return;

        spin_lock(&fs_info->tree_mod_seq_lock);
        list_del(&elem->list);
        elem->seq = 0;

        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
                if (cur_elem->seq < min_seq) {
                        if (seq_putting > cur_elem->seq) {
                                /*
                                 * blocker with lower sequence number exists, we
                                 * cannot remove anything from the log
                                 */
                                spin_unlock(&fs_info->tree_mod_seq_lock);
                                return;
                        }
                        min_seq = cur_elem->seq;
                }
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);

        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        write_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = rb_entry(node, struct tree_mod_elem, node);
                if (tm->seq > min_seq)
                        continue;
                rb_erase(node, tm_root);
                kfree(tm);
        }
        write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * the lock until all tree mod log insertions are recorded in the rb tree
 * and then write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb) {
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 1;
        if (eb && btrfs_header_level(eb) == 0)
                return 1;

        write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
                write_unlock(&fs_info->tree_mod_log_lock);
                return 1;
        }

        return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 0;
        if (eb && btrfs_header_level(eb) == 0)
                return 0;

        return 1;
}
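
/*
 * Note (illustrative): the two helpers above form a double-check pattern
 * used throughout this file: tree_mod_need_log() is a cheap, lockless
 * check done before allocating log elements, and tree_mod_dont_log()
 * re-checks under fs_info->tree_mod_log_lock right before the elements
 * are inserted into the rb tree.
 */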

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
                    enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        if (op != MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;
        int ret;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                return 0;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
                int dst_slot, int src_slot, int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we overwrite something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;
free_tms:
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);

        return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                       struct tree_mod_elem **tm_list,
                       int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
                         struct extent_buffer *new_root, int log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                      int smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
                           u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;

        ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
{
        /*
         * Tree blocks not in reference counted trees and tree roots
         * are never shared. If a block was allocated after the last
         * snapshot and the block was not allocated by tree relocation,
         * we know the block is not shared.
         */
        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;

        return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree blocks
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in the tree block.
         *
         * If a tree block is being relocated
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in the tree block.
         * The reason for this is that some operations (such as dropping
         * a tree) are only allowed for blocks that use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, fs_info,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                clean_tree_block(fs_info, buf);
                *last_ref = 1;
        }
        return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;

        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;

        /*
         * If we are COWing a node/leaf from the extent, chunk or device trees,
         * make sure that we do not finish block group creation of pending block
         * groups. We do this to avoid a deadlock.
         * COWing can result in allocation of a new chunk, and flushing pending
         * block groups (btrfs_create_pending_block_groups()) can be triggered
         * when finishing allocation of a new chunk. Creation of a pending block
         * group modifies the extent, chunk and device trees, therefore we could
         * deadlock with ourselves since we are holding a lock on an extent
         * buffer that btrfs_create_pending_block_groups() may try to COW later.
         */
        if (root == fs_info->extent_root ||
            root == fs_info->chunk_root ||
            root == fs_info->dev_root)
                trans->can_flush_pending_bgs = false;

        cow = btrfs_alloc_tree_block(trans, root, parent_start,
                        root->root_key.objectid, &disk_key, level,
                        search_start, empty_size);
        trans->can_flush_pending_bgs = true;
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer_fsid(cow, fs_info->fsid);

        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                extent_buffer_get(cow);
                ret = tree_mod_log_insert_root(root->node, cow, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operations for the oldest root, we
                 * simply return it. this should only happen if that (old)
                 * root is at level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operation type
                 * used for the modification. as we're going backwards, we do
                 * the opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        btrfs_tree_read_lock(eb);
        if (old_root) {
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
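
/*
 * Note (illustrative): get_old_root() is the entry point used by the
 * time_seq based search path (btrfs_search_old_slot() further down in
 * this file) to obtain a read-locked view of the root as it looked at
 * @time_seq.
 */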

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);

        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
                level = tm->old_root.level;
        } else {
                level = btrfs_header_level(eb_root);
        }
        free_extent_buffer(eb_root);

        return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
{
        if (btrfs_is_testing(root->fs_info))
                return 0;

        /* Ensure we can see the FORCE_COW bit */
        smp_mb__before_atomic();

        /*
         * We do not need to cow a block if
         * 1) this block is not created or changed in this transaction;
         * 2) this block does not belong to TREE_RELOC tree;
         * 3) the root is not forced COW.
         *
         * What is forced COW:
         *    when we create a snapshot during committing the transaction,
         *    after we've finished copying the src root, we must COW the
         *    shared block to ensure the metadata consistency.
         */
        if (btrfs_header_generation(buf) == trans->transid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
                return 0;
        return 1;
}

1429 /*
1430  * cows a single block, see __btrfs_cow_block for the real work.
1431  * This version of it has extra checks so that a block isn't COWed more than
1432  * once per transaction, as long as it hasn't been written yet
1433  */
1434 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1435                     struct btrfs_root *root, struct extent_buffer *buf,
1436                     struct extent_buffer *parent, int parent_slot,
1437                     struct extent_buffer **cow_ret)
1438 {
1439         struct btrfs_fs_info *fs_info = root->fs_info;
1440         u64 search_start;
1441         int ret;
1442
1443         if (trans->transaction != fs_info->running_transaction)
1444                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1445                        trans->transid,
1446                        fs_info->running_transaction->transid);
1447
1448         if (trans->transid != fs_info->generation)
1449                 WARN(1, KERN_CRIT "trans %llu current generation %llu\n",
1450                        trans->transid, fs_info->generation);
1451
1452         if (!should_cow_block(trans, root, buf)) {
1453                 trans->dirty = true;
1454                 *cow_ret = buf;
1455                 return 0;
1456         }
1457
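        /*
         * Worked example (hypothetical numbers): buf->start == 0x65432100
         * masks down to search_start == 0x40000000, i.e. the start of the
         * 1GiB-aligned region containing the block, which __btrfs_cow_block
         * uses as an allocation hint.
         */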
1458         search_start = buf->start & ~((u64)SZ_1G - 1);
1459
1460         if (parent)
1461                 btrfs_set_lock_blocking(parent);
1462         btrfs_set_lock_blocking(buf);
1463
1464         ret = __btrfs_cow_block(trans, root, buf, parent,
1465                                  parent_slot, cow_ret, search_start, 0);
1466
1467         trace_btrfs_cow_block(root, buf, *cow_ret);
1468
1469         return ret;
1470 }
1471
1472 /*
1473  * helper function for defrag to decide if two blocks pointed to by a
1474  * node are actually close by
1475  */
1476 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1477 {
1478         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1479                 return 1;
1480         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1481                 return 1;
1482         return 0;
1483 }
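
/*
 * Worked example (hypothetical numbers, 16KiB blocks): blocknr == 16384
 * and other == 49152 leave a gap of 49152 - (16384 + 16384) == 16384
 * bytes, under the 32768 byte threshold, so the blocks count as close;
 * with other == 131072 the gap is 98304 bytes and they do not.
 */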
1484
1485 /*
1486  * compare two keys in a memcmp fashion
1487  */
1488 static int comp_keys(const struct btrfs_disk_key *disk,
1489                      const struct btrfs_key *k2)
1490 {
1491         struct btrfs_key k1;
1492
1493         btrfs_disk_key_to_cpu(&k1, disk);
1494
1495         return btrfs_comp_cpu_keys(&k1, k2);
1496 }
1497
1498 /*
1499  * same as comp_keys only with two btrfs_key's
1500  */
1501 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1502 {
1503         if (k1->objectid > k2->objectid)
1504                 return 1;
1505         if (k1->objectid < k2->objectid)
1506                 return -1;
1507         if (k1->type > k2->type)
1508                 return 1;
1509         if (k1->type < k2->type)
1510                 return -1;
1511         if (k1->offset > k2->offset)
1512                 return 1;
1513         if (k1->offset < k2->offset)
1514                 return -1;
1515         return 0;
1516 }
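
/*
 * Hypothetical example of the resulting total order: keys sort by
 * objectid first, then type, then offset, so {257, BTRFS_INODE_ITEM_KEY, 0}
 * sorts before {257, BTRFS_EXTENT_DATA_KEY, 0} (type 1 < type 108), and
 * both sort before any key with objectid 258.
 */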
1517
1518 /*
1519  * this is used by the defrag code to go through all the
1520  * leaves pointed to by a node and reallocate them so that
1521  * disk order is close to key order
1522  */
1523 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1524                        struct btrfs_root *root, struct extent_buffer *parent,
1525                        int start_slot, u64 *last_ret,
1526                        struct btrfs_key *progress)
1527 {
1528         struct btrfs_fs_info *fs_info = root->fs_info;
1529         struct extent_buffer *cur;
1530         u64 blocknr;
1531         u64 gen;
1532         u64 search_start = *last_ret;
1533         u64 last_block = 0;
1534         u64 other;
1535         u32 parent_nritems;
1536         int end_slot;
1537         int i;
1538         int err = 0;
1539         int parent_level;
1540         int uptodate;
1541         u32 blocksize;
1542         int progress_passed = 0;
1543         struct btrfs_disk_key disk_key;
1544
1545         parent_level = btrfs_header_level(parent);
1546
1547         WARN_ON(trans->transaction != fs_info->running_transaction);
1548         WARN_ON(trans->transid != fs_info->generation);
1549
1550         parent_nritems = btrfs_header_nritems(parent);
1551         blocksize = fs_info->nodesize;
1552         end_slot = parent_nritems - 1;
1553
1554         if (parent_nritems <= 1)
1555                 return 0;
1556
1557         btrfs_set_lock_blocking(parent);
1558
1559         for (i = start_slot; i <= end_slot; i++) {
1560                 struct btrfs_key first_key;
1561                 int close = 1;
1562
1563                 btrfs_node_key(parent, &disk_key, i);
1564                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1565                         continue;
1566
1567                 progress_passed = 1;
1568                 blocknr = btrfs_node_blockptr(parent, i);
1569                 gen = btrfs_node_ptr_generation(parent, i);
1570                 btrfs_node_key_to_cpu(parent, &first_key, i);
1571                 if (last_block == 0)
1572                         last_block = blocknr;
1573
1574                 if (i > 0) {
1575                         other = btrfs_node_blockptr(parent, i - 1);
1576                         close = close_blocks(blocknr, other, blocksize);
1577                 }
1578                 if (!close && i < end_slot) {
1579                         other = btrfs_node_blockptr(parent, i + 1);
1580                         close = close_blocks(blocknr, other, blocksize);
1581                 }
1582                 if (close) {
1583                         last_block = blocknr;
1584                         continue;
1585                 }
1586
1587                 cur = find_extent_buffer(fs_info, blocknr);
1588                 if (cur)
1589                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1590                 else
1591                         uptodate = 0;
1592                 if (!cur || !uptodate) {
1593                         if (!cur) {
1594                                 cur = read_tree_block(fs_info, blocknr, gen,
1595                                                       parent_level - 1,
1596                                                       &first_key);
1597                                 if (IS_ERR(cur)) {
1598                                         return PTR_ERR(cur);
1599                                 } else if (!extent_buffer_uptodate(cur)) {
1600                                         free_extent_buffer(cur);
1601                                         return -EIO;
1602                                 }
1603                         } else if (!uptodate) {
1604                                 err = btrfs_read_buffer(cur, gen,
1605                                                 parent_level - 1, &first_key);
1606                                 if (err) {
1607                                         free_extent_buffer(cur);
1608                                         return err;
1609                                 }
1610                         }
1611                 }
1612                 if (search_start == 0)
1613                         search_start = last_block;
1614
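                /*
                 * COW the block with search_start as a placement hint and
                 * ask the allocator for up to 16 extra node-sized blocks of
                 * free space (or however many slots remain), so the children
                 * that follow can be placed contiguously.
                 */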
1615                 btrfs_tree_lock(cur);
1616                 btrfs_set_lock_blocking(cur);
1617                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1618                                         &cur, search_start,
1619                                         min(16 * blocksize,
1620                                             (end_slot - i) * blocksize));
1621                 if (err) {
1622                         btrfs_tree_unlock(cur);
1623                         free_extent_buffer(cur);
1624                         break;
1625                 }
1626                 search_start = cur->start;
1627                 last_block = cur->start;
1628                 *last_ret = search_start;
1629                 btrfs_tree_unlock(cur);
1630                 free_extent_buffer(cur);
1631         }
1632         return err;
1633 }
1634
1635 /*
1636  * search for key in the extent_buffer.  The items start at offset p,
1637  * and they are item_size apart.  There are 'max' items in p.
1638  *
1639  * the slot in the array is returned via slot, and it points to
1640  * the place where you would insert key if it is not found in
1641  * the array.
1642  *
1643  * slot may point to max if the key is bigger than all of the keys
1644  */
1645 static noinline int generic_bin_search(struct extent_buffer *eb,
1646                                        unsigned long p, int item_size,
1647                                        const struct btrfs_key *key,
1648                                        int max, int *slot)
1649 {
1650         int low = 0;
1651         int high = max;
1652         int mid;
1653         int ret;
1654         struct btrfs_disk_key *tmp = NULL;
1655         struct btrfs_disk_key unaligned;
1656         unsigned long offset;
1657         char *kaddr = NULL;
1658         unsigned long map_start = 0;
1659         unsigned long map_len = 0;
1660         int err;
1661
1662         if (low > high) {
1663                 btrfs_err(eb->fs_info,
1664                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1665                           __func__, low, high, eb->start,
1666                           btrfs_header_owner(eb), btrfs_header_level(eb));
1667                 return -EINVAL;
1668         }
1669
1670         while (low < high) {
1671                 mid = (low + high) / 2;
1672                 offset = p + mid * item_size;
1673
1674                 if (!kaddr || offset < map_start ||
1675                     (offset + sizeof(struct btrfs_disk_key)) >
1676                     map_start + map_len) {
1677
1678                         err = map_private_extent_buffer(eb, offset,
1679                                                 sizeof(struct btrfs_disk_key),
1680                                                 &kaddr, &map_start, &map_len);
1681
1682                         if (!err) {
1683                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1684                                                         map_start);
1685                         } else if (err == 1) {
1686                                 read_extent_buffer(eb, &unaligned,
1687                                                    offset, sizeof(unaligned));
1688                                 tmp = &unaligned;
1689                         } else {
1690                                 return err;
1691                         }
1692
1693                 } else {
1694                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1695                                                         map_start);
1696                 }
1697                 ret = comp_keys(tmp, key);
1698
1699                 if (ret < 0)
1700                         low = mid + 1;
1701                 else if (ret > 0)
1702                         high = mid;
1703                 else {
1704                         *slot = mid;
1705                         return 0;
1706                 }
1707         }
1708         *slot = low;
1709         return 1;
1710 }
1711
1712 /*
1713  * simple bin_search frontend that does the right thing for
1714  * leaves vs nodes
1715  */
1716 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1717                      int level, int *slot)
1718 {
1719         if (level == 0)
1720                 return generic_bin_search(eb,
1721                                           offsetof(struct btrfs_leaf, items),
1722                                           sizeof(struct btrfs_item),
1723                                           key, btrfs_header_nritems(eb),
1724                                           slot);
1725         else
1726                 return generic_bin_search(eb,
1727                                           offsetof(struct btrfs_node, ptrs),
1728                                           sizeof(struct btrfs_key_ptr),
1729                                           key, btrfs_header_nritems(eb),
1730                                           slot);
1731 }
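
/*
 * Hypothetical usage sketch (not part of the original file) showing the
 * return convention shared by btrfs_bin_search and generic_bin_search:
 * 0 means the key was found and *slot is its index, 1 means it was not
 * found and *slot is the insertion position (possibly equal to the item
 * count), and negative values are errors.
 */
#if 0
static int example_lookup(struct extent_buffer *eb,
                          const struct btrfs_key *key)
{
        int slot;
        int ret;

        ret = btrfs_bin_search(eb, key, btrfs_header_level(eb), &slot);
        if (ret == 0)
                pr_info("found at slot %d\n", slot);    /* exact match */
        else if (ret == 1)
                pr_info("would insert at slot %d\n", slot);
        return ret;
}
#endif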
1732
1733 static void root_add_used(struct btrfs_root *root, u32 size)
1734 {
1735         spin_lock(&root->accounting_lock);
1736         btrfs_set_root_used(&root->root_item,
1737                             btrfs_root_used(&root->root_item) + size);
1738         spin_unlock(&root->accounting_lock);
1739 }
1740
1741 static void root_sub_used(struct btrfs_root *root, u32 size)
1742 {
1743         spin_lock(&root->accounting_lock);
1744         btrfs_set_root_used(&root->root_item,
1745                             btrfs_root_used(&root->root_item) - size);
1746         spin_unlock(&root->accounting_lock);
1747 }
1748
1749 /* given a node and slot number, this reads the block it points to.  The
1750  * extent buffer is returned with a reference taken (but unlocked).
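 *
 * Errors come back as ERR_PTRs: -ENOENT for an out of range slot and
 * -EIO when the block cannot be read or fails verification.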
1751  */
1752 static noinline struct extent_buffer *
1753 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1754                int slot)
1755 {
1756         int level = btrfs_header_level(parent);
1757         struct extent_buffer *eb;
1758         struct btrfs_key first_key;
1759
1760         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1761                 return ERR_PTR(-ENOENT);
1762
1763         BUG_ON(level == 0);
1764
1765         btrfs_node_key_to_cpu(parent, &first_key, slot);
1766         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1767                              btrfs_node_ptr_generation(parent, slot),
1768                              level - 1, &first_key);
1769         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1770                 free_extent_buffer(eb);
1771                 eb = ERR_PTR(-EIO);
1772         }
1773
1774         return eb;
1775 }
1776
1777 /*
1778  * node level balancing, used to make sure nodes are in proper order for
1779  * item deletion.  We balance from the top down, so we have to make sure
1780  * that a deletion won't leave a node completely empty later on.
1781  */
1782 static noinline int balance_level(struct btrfs_trans_handle *trans,
1783                          struct btrfs_root *root,
1784                          struct btrfs_path *path, int level)
1785 {
1786         struct btrfs_fs_info *fs_info = root->fs_info;
1787         struct extent_buffer *right = NULL;
1788         struct extent_buffer *mid;
1789         struct extent_buffer *left = NULL;
1790         struct extent_buffer *parent = NULL;
1791         int ret = 0;
1792         int wret;
1793         int pslot;
1794         int orig_slot = path->slots[level];
1795         u64 orig_ptr;
1796
1797         ASSERT(level > 0);
1798
1799         mid = path->nodes[level];
1800
1801         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1802                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1803         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1804
1805         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1806
1807         if (level < BTRFS_MAX_LEVEL - 1) {
1808                 parent = path->nodes[level + 1];
1809                 pslot = path->slots[level + 1];
1810         }
1811
1812         /*
1813          * deal with the case where there is only one pointer in the root
1814          * by promoting the node below to a root
1815          */
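        /*
         * Illustration: before, the root holds a single pointer; after,
         * that child becomes the new root and the old root is freed.
         *
         *        [ root: 1 ptr ]
         *               |          ==>      [ child ]   (new root)
         *           [ child ]
         */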
1816         if (!parent) {
1817                 struct extent_buffer *child;
1818
1819                 if (btrfs_header_nritems(mid) != 1)
1820                         return 0;
1821
1822                 /* promote the child to a root */
1823                 child = read_node_slot(fs_info, mid, 0);
1824                 if (IS_ERR(child)) {
1825                         ret = PTR_ERR(child);
1826                         btrfs_handle_fs_error(fs_info, ret, NULL);
1827                         goto enospc;
1828                 }
1829
1830                 btrfs_tree_lock(child);
1831                 btrfs_set_lock_blocking(child);
1832                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1833                 if (ret) {
1834                         btrfs_tree_unlock(child);
1835                         free_extent_buffer(child);
1836                         goto enospc;
1837                 }
1838
1839                 ret = tree_mod_log_insert_root(root->node, child, 1);
1840                 BUG_ON(ret < 0);
1841                 rcu_assign_pointer(root->node, child);
1842
1843                 add_root_to_dirty_list(root);
1844                 btrfs_tree_unlock(child);
1845
1846                 path->locks[level] = 0;
1847                 path->nodes[level] = NULL;
1848                 clean_tree_block(fs_info, mid);
1849                 btrfs_tree_unlock(mid);
1850                 /* once for the path */
1851                 free_extent_buffer(mid);
1852
1853                 root_sub_used(root, mid->len);
1854                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1855                 /* once for the root ptr */
1856                 free_extent_buffer_stale(mid);
1857                 return 0;
1858         }
1859         if (btrfs_header_nritems(mid) >
1860             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1861                 return 0;
1862
1863         left = read_node_slot(fs_info, parent, pslot - 1);
1864         if (IS_ERR(left))
1865                 left = NULL;
1866
1867         if (left) {
1868                 btrfs_tree_lock(left);
1869                 btrfs_set_lock_blocking(left);
1870                 wret = btrfs_cow_block(trans, root, left,
1871                                        parent, pslot - 1, &left);
1872                 if (wret) {
1873                         ret = wret;
1874                         goto enospc;
1875                 }
1876         }
1877
1878         right = read_node_slot(fs_info, parent, pslot + 1);
1879         if (IS_ERR(right))
1880                 right = NULL;
1881
1882         if (right) {
1883                 btrfs_tree_lock(right);
1884                 btrfs_set_lock_blocking(right);
1885                 wret = btrfs_cow_block(trans, root, right,
1886                                        parent, pslot + 1, &right);
1887                 if (wret) {
1888                         ret = wret;
1889                         goto enospc;
1890                 }
1891         }
1892
1893         /* first, try to make some room in the middle buffer */
1894         if (left) {
1895                 orig_slot += btrfs_header_nritems(left);
1896                 wret = push_node_left(trans, fs_info, left, mid, 1);
1897                 if (wret < 0)
1898                         ret = wret;
1899         }
1900
1901         /*
1902          * then try to empty the rightmost buffer into the middle
1903          */
1904         if (right) {
1905                 wret = push_node_left(trans, fs_info, mid, right, 1);
1906                 if (wret < 0 && wret != -ENOSPC)
1907                         ret = wret;
1908                 if (btrfs_header_nritems(right) == 0) {
1909                         clean_tree_block(fs_info, right);
1910                         btrfs_tree_unlock(right);
1911                         del_ptr(root, path, level + 1, pslot + 1);
1912                         root_sub_used(root, right->len);
1913                         btrfs_free_tree_block(trans, root, right, 0, 1);
1914                         free_extent_buffer_stale(right);
1915                         right = NULL;
1916                 } else {
1917                         struct btrfs_disk_key right_key;
1918                         btrfs_node_key(right, &right_key, 0);
1919                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1920                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1921                         BUG_ON(ret < 0);
1922                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1923                         btrfs_mark_buffer_dirty(parent);
1924                 }
1925         }
1926         if (btrfs_header_nritems(mid) == 1) {
1927                 /*
1928                  * we're not allowed to leave a node with one item in the
1929                  * tree during a delete.  A deletion from lower in the tree
1930                  * could try to delete the only pointer in this node.
1931                  * So, pull some keys from the left.
1932                  * There has to be a left pointer at this point because
1933                  * otherwise we would have pulled some pointers from the
1934                  * right
1935                  */
1936                 if (!left) {
1937                         ret = -EROFS;
1938                         btrfs_handle_fs_error(fs_info, ret, NULL);
1939                         goto enospc;
1940                 }
1941                 wret = balance_node_right(trans, fs_info, mid, left);
1942                 if (wret < 0) {
1943                         ret = wret;
1944                         goto enospc;
1945                 }
1946                 if (wret == 1) {
1947                         wret = push_node_left(trans, fs_info, left, mid, 1);
1948                         if (wret < 0)
1949                                 ret = wret;
1950                 }
1951                 BUG_ON(wret == 1);
1952         }
1953         if (btrfs_header_nritems(mid) == 0) {
1954                 clean_tree_block(fs_info, mid);
1955                 btrfs_tree_unlock(mid);
1956                 del_ptr(root, path, level + 1, pslot);
1957                 root_sub_used(root, mid->len);
1958                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1959                 free_extent_buffer_stale(mid);
1960                 mid = NULL;
1961         } else {
1962                 /* update the parent key to reflect our changes */
1963                 struct btrfs_disk_key mid_key;
1964                 btrfs_node_key(mid, &mid_key, 0);
1965                 ret = tree_mod_log_insert_key(parent, pslot,
1966                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1967                 BUG_ON(ret < 0);
1968                 btrfs_set_node_key(parent, &mid_key, pslot);
1969                 btrfs_mark_buffer_dirty(parent);
1970         }
1971
1972         /* update the path */
1973         if (left) {
1974                 if (btrfs_header_nritems(left) > orig_slot) {
1975                         extent_buffer_get(left);
1976                         /* left was locked after cow */
1977                         path->nodes[level] = left;
1978                         path->slots[level + 1] -= 1;
1979                         path->slots[level] = orig_slot;
1980                         if (mid) {
1981                                 btrfs_tree_unlock(mid);
1982                                 free_extent_buffer(mid);
1983                         }
1984                 } else {
1985                         orig_slot -= btrfs_header_nritems(left);
1986                         path->slots[level] = orig_slot;
1987                 }
1988         }
1989         /* double check we haven't messed things up */
1990         if (orig_ptr !=
1991             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1992                 BUG();
1993 enospc:
1994         if (right) {
1995                 btrfs_tree_unlock(right);
1996                 free_extent_buffer(right);
1997         }
1998         if (left) {
1999                 if (path->nodes[level] != left)
2000                         btrfs_tree_unlock(left);
2001                 free_extent_buffer(left);
2002         }
2003         return ret;
2004 }
2005
2006 /* Node balancing for insertion.  Here we only split or push nodes around
2007  * when they are completely full.  This is also done top down, so we
2008  * have to be pessimistic.
2009  */
2010 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2011                                           struct btrfs_root *root,
2012                                           struct btrfs_path *path, int level)
2013 {
2014         struct btrfs_fs_info *fs_info = root->fs_info;
2015         struct extent_buffer *right = NULL;
2016         struct extent_buffer *mid;
2017         struct extent_buffer *left = NULL;
2018         struct extent_buffer *parent = NULL;
2019         int ret = 0;
2020         int wret;
2021         int pslot;
2022         int orig_slot = path->slots[level];
2023
2024         if (level == 0)
2025                 return 1;
2026
2027         mid = path->nodes[level];
2028         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2029
2030         if (level < BTRFS_MAX_LEVEL - 1) {
2031                 parent = path->nodes[level + 1];
2032                 pslot = path->slots[level + 1];
2033         }
2034
2035         if (!parent)
2036                 return 1;
2037
2038         left = read_node_slot(fs_info, parent, pslot - 1);
2039         if (IS_ERR(left))
2040                 left = NULL;
2041
2042         /* first, try to make some room in the middle buffer */
2043         if (left) {
2044                 u32 left_nr;
2045
2046                 btrfs_tree_lock(left);
2047                 btrfs_set_lock_blocking(left);
2048
2049                 left_nr = btrfs_header_nritems(left);
2050                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2051                         wret = 1;
2052                 } else {
2053                         ret = btrfs_cow_block(trans, root, left, parent,
2054                                               pslot - 1, &left);
2055                         if (ret)
2056                                 wret = 1;
2057                         else {
2058                                 wret = push_node_left(trans, fs_info,
2059                                                       left, mid, 0);
2060                         }
2061                 }
2062                 if (wret < 0)
2063                         ret = wret;
2064                 if (wret == 0) {
2065                         struct btrfs_disk_key disk_key;
2066                         orig_slot += left_nr;
2067                         btrfs_node_key(mid, &disk_key, 0);
2068                         ret = tree_mod_log_insert_key(parent, pslot,
2069                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2070                         BUG_ON(ret < 0);
2071                         btrfs_set_node_key(parent, &disk_key, pslot);
2072                         btrfs_mark_buffer_dirty(parent);
2073                         if (btrfs_header_nritems(left) > orig_slot) {
2074                                 path->nodes[level] = left;
2075                                 path->slots[level + 1] -= 1;
2076                                 path->slots[level] = orig_slot;
2077                                 btrfs_tree_unlock(mid);
2078                                 free_extent_buffer(mid);
2079                         } else {
2080                                 orig_slot -=
2081                                         btrfs_header_nritems(left);
2082                                 path->slots[level] = orig_slot;
2083                                 btrfs_tree_unlock(left);
2084                                 free_extent_buffer(left);
2085                         }
2086                         return 0;
2087                 }
2088                 btrfs_tree_unlock(left);
2089                 free_extent_buffer(left);
2090         }
2091         right = read_node_slot(fs_info, parent, pslot + 1);
2092         if (IS_ERR(right))
2093                 right = NULL;
2094
2095         /*
2096          * then try to empty the rightmost buffer into the middle
2097          */
2098         if (right) {
2099                 u32 right_nr;
2100
2101                 btrfs_tree_lock(right);
2102                 btrfs_set_lock_blocking(right);
2103
2104                 right_nr = btrfs_header_nritems(right);
2105                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2106                         wret = 1;
2107                 } else {
2108                         ret = btrfs_cow_block(trans, root, right,
2109                                               parent, pslot + 1,
2110                                               &right);
2111                         if (ret)
2112                                 wret = 1;
2113                         else {
2114                                 wret = balance_node_right(trans, fs_info,
2115                                                           right, mid);
2116                         }
2117                 }
2118                 if (wret < 0)
2119                         ret = wret;
2120                 if (wret == 0) {
2121                         struct btrfs_disk_key disk_key;
2122
2123                         btrfs_node_key(right, &disk_key, 0);
2124                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2125                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2126                         BUG_ON(ret < 0);
2127                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2128                         btrfs_mark_buffer_dirty(parent);
2129
2130                         if (btrfs_header_nritems(mid) <= orig_slot) {
2131                                 path->nodes[level] = right;
2132                                 path->slots[level + 1] += 1;
2133                                 path->slots[level] = orig_slot -
2134                                         btrfs_header_nritems(mid);
2135                                 btrfs_tree_unlock(mid);
2136                                 free_extent_buffer(mid);
2137                         } else {
2138                                 btrfs_tree_unlock(right);
2139                                 free_extent_buffer(right);
2140                         }
2141                         return 0;
2142                 }
2143                 btrfs_tree_unlock(right);
2144                 free_extent_buffer(right);
2145         }
2146         return 1;
2147 }
2148
2149 /*
2150  * readahead one full node of leaves, finding things that are close
2151  * to the block in 'slot', and triggering readahead on them.
2152  */
2153 static void reada_for_search(struct btrfs_fs_info *fs_info,
2154                              struct btrfs_path *path,
2155                              int level, int slot, u64 objectid)
2156 {
2157         struct extent_buffer *node;
2158         struct btrfs_disk_key disk_key;
2159         u32 nritems;
2160         u64 search;
2161         u64 target;
2162         u64 nread = 0;
2163         struct extent_buffer *eb;
2164         u32 nr;
2165         u32 blocksize;
2166         u32 nscan = 0;
2167
2168         if (level != 1)
2169                 return;
2170
2171         if (!path->nodes[level])
2172                 return;
2173
2174         node = path->nodes[level];
2175
2176         search = btrfs_node_blockptr(node, slot);
2177         blocksize = fs_info->nodesize;
2178         eb = find_extent_buffer(fs_info, search);
2179         if (eb) {
2180                 free_extent_buffer(eb);
2181                 return;
2182         }
2183
2184         target = search;
2185
2186         nritems = btrfs_header_nritems(node);
2187         nr = slot;
2188
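        /*
         * Walk outward from 'slot' (backward for READA_BACK, forward for
         * READA_FORWARD) and read ahead any sibling block within 64KiB of
         * the target, stopping after 64KiB of readahead or 32 slots.
         */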
2189         while (1) {
2190                 if (path->reada == READA_BACK) {
2191                         if (nr == 0)
2192                                 break;
2193                         nr--;
2194                 } else if (path->reada == READA_FORWARD) {
2195                         nr++;
2196                         if (nr >= nritems)
2197                                 break;
2198                 }
2199                 if (path->reada == READA_BACK && objectid) {
2200                         btrfs_node_key(node, &disk_key, nr);
2201                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2202                                 break;
2203                 }
2204                 search = btrfs_node_blockptr(node, nr);
2205                 if ((search <= target && target - search <= 65536) ||
2206                     (search > target && search - target <= 65536)) {
2207                         readahead_tree_block(fs_info, search);
2208                         nread += blocksize;
2209                 }
2210                 nscan++;
2211                 if (nread > 65536 || nscan > 32)
2212                         break;
2213         }
2214 }
2215
2216 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2217                                        struct btrfs_path *path, int level)
2218 {
2219         int slot;
2220         int nritems;
2221         struct extent_buffer *parent;
2222         struct extent_buffer *eb;
2223         u64 gen;
2224         u64 block1 = 0;
2225         u64 block2 = 0;
2226
2227         parent = path->nodes[level + 1];
2228         if (!parent)
2229                 return;
2230
2231         nritems = btrfs_header_nritems(parent);
2232         slot = path->slots[level + 1];
2233
2234         if (slot > 0) {
2235                 block1 = btrfs_node_blockptr(parent, slot - 1);
2236                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2237                 eb = find_extent_buffer(fs_info, block1);
2238                 /*
2239                  * if we get -EAGAIN from btrfs_buffer_uptodate, we
2240                  * don't want to return -EAGAIN here.  That will loop
2241                  * forever.
2242                  */
2243                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2244                         block1 = 0;
2245                 free_extent_buffer(eb);
2246         }
2247         if (slot + 1 < nritems) {
2248                 block2 = btrfs_node_blockptr(parent, slot + 1);
2249                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2250                 eb = find_extent_buffer(fs_info, block2);
2251                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2252                         block2 = 0;
2253                 free_extent_buffer(eb);
2254         }
2255
2256         if (block1)
2257                 readahead_tree_block(fs_info, block1);
2258         if (block2)
2259                 readahead_tree_block(fs_info, block2);
2260 }
2261
2262
2263 /*
2264  * when we walk down the tree, it is usually safe to unlock the higher layers
2265  * in the tree.  The exceptions are when our path goes through slot 0, because
2266  * operations on the tree might require changing key pointers higher up in the
2267  * tree.
2268  *
2269  * callers might also have set path->keep_locks, which tells this code to keep
2270  * the lock if the path points to the last slot in the block.  This is part of
2271  * walking through the tree, and selecting the next slot in the higher block.
2272  *
2273  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2274  * if lowest_unlock is 1, level 0 won't be unlocked.
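 *
 * Hypothetical example: if the path went through slot 0 at level 2, a key
 * change at level 0 may have to propagate a new first key up through
 * levels 1 and 2, so those locks are kept even when lowest_unlock would
 * otherwise allow dropping them.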
2275  */
2276 static noinline void unlock_up(struct btrfs_path *path, int level,
2277                                int lowest_unlock, int min_write_lock_level,
2278                                int *write_lock_level)
2279 {
2280         int i;
2281         int skip_level = level;
2282         int no_skips = 0;
2283         struct extent_buffer *t;
2284
2285         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2286                 if (!path->nodes[i])
2287                         break;
2288                 if (!path->locks[i])
2289                         break;
2290                 if (!no_skips && path->slots[i] == 0) {
2291                         skip_level = i + 1;
2292                         continue;
2293                 }
2294                 if (!no_skips && path->keep_locks) {
2295                         u32 nritems;
2296                         t = path->nodes[i];
2297                         nritems = btrfs_header_nritems(t);
2298                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2299                                 skip_level = i + 1;
2300                                 continue;
2301                         }
2302                 }
2303                 if (skip_level < i && i >= lowest_unlock)
2304                         no_skips = 1;
2305
2306                 t = path->nodes[i];
2307                 if (i >= lowest_unlock && i > skip_level) {
2308                         btrfs_tree_unlock_rw(t, path->locks[i]);
2309                         path->locks[i] = 0;
2310                         if (write_lock_level &&
2311                             i > min_write_lock_level &&
2312                             i <= *write_lock_level) {
2313                                 *write_lock_level = i - 1;
2314                         }
2315                 }
2316         }
2317 }
2318
2319 /*
2320  * This releases any locks held in the path starting at level and
2321  * going all the way up to the root.
2322  *
2323  * btrfs_search_slot will keep the lock held on higher nodes in a few
2324  * corner cases, such as COW of the block at slot zero in the node.  This
2325  * ignores those rules, and it should only be called when there are no
2326  * more updates to be done higher up in the tree.
2327  */
2328 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2329 {
2330         int i;
2331
2332         if (path->keep_locks)
2333                 return;
2334
2335         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2336                 if (!path->nodes[i])
2337                         continue;
2338                 if (!path->locks[i])
2339                         continue;
2340                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2341                 path->locks[i] = 0;
2342         }
2343 }
2344
2345 /*
2346  * helper function for btrfs_search_slot.  The goal is to find a block
2347  * in cache without setting the path to blocking.  If we find the block
2348  * we return zero and the path is unchanged.
2349  *
2350  * If we can't find the block, we set the path blocking and do some
2351  * reada.  -EAGAIN is returned and the search must be repeated.
2352  */
2353 static int
2354 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2355                       struct extent_buffer **eb_ret, int level, int slot,
2356                       const struct btrfs_key *key)
2357 {
2358         struct btrfs_fs_info *fs_info = root->fs_info;
2359         u64 blocknr;
2360         u64 gen;
2361         struct extent_buffer *b = *eb_ret;
2362         struct extent_buffer *tmp;
2363         struct btrfs_key first_key;
2364         int ret;
2365         int parent_level;
2366
2367         blocknr = btrfs_node_blockptr(b, slot);
2368         gen = btrfs_node_ptr_generation(b, slot);
2369         parent_level = btrfs_header_level(b);
2370         btrfs_node_key_to_cpu(b, &first_key, slot);
2371
2372         tmp = find_extent_buffer(fs_info, blocknr);
2373         if (tmp) {
2374                 /* first we do an atomic uptodate check */
2375                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2376                         *eb_ret = tmp;
2377                         return 0;
2378                 }
2379
2380                 /* the pages were up to date, but we failed
2381                  * the generation number check.  Do a full
2382                  * read for the generation number that is correct.
2383                  * We must do this without dropping locks so
2384                  * we can trust our generation number
2385                  */
2386                 btrfs_set_path_blocking(p);
2387
2388                 /* now we're allowed to do a blocking uptodate check */
2389                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2390                 if (!ret) {
2391                         *eb_ret = tmp;
2392                         return 0;
2393                 }
2394                 free_extent_buffer(tmp);
2395                 btrfs_release_path(p);
2396                 return -EIO;
2397         }
2398
2399         /*
2400          * reduce lock contention at high levels
2401          * of the btree by dropping locks before
2402          * we read.  Don't release the lock on the current
2403          * level because we need to walk this node to figure
2404          * out which blocks to read.
2405          */
2406         btrfs_unlock_up_safe(p, level + 1);
2407         btrfs_set_path_blocking(p);
2408
2409         if (p->reada != READA_NONE)
2410                 reada_for_search(fs_info, p, level, slot, key->objectid);
2411
2412         ret = -EAGAIN;
2413         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2414                               &first_key);
2415         if (!IS_ERR(tmp)) {
2416                 /*
2417                  * If the read above didn't mark this buffer up to date,
2418                  * it will never end up being up to date.  Set ret to -EIO now
2419                  * and give up so that our caller doesn't loop forever
2420                  * on our EAGAINs.
2421                  */
2422                 if (!extent_buffer_uptodate(tmp))
2423                         ret = -EIO;
2424                 free_extent_buffer(tmp);
2425         } else {
2426                 ret = PTR_ERR(tmp);
2427         }
2428
2429         btrfs_release_path(p);
2430         return ret;
2431 }
2432
2433 /*
2434  * helper function for btrfs_search_slot.  This does all of the checks
2435  * for node-level blocks and does any balancing required based on
2436  * the ins_len.
2437  *
2438  * If no extra work was required, zero is returned.  If we had to
2439  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2440  * start over
2441  */
2442 static int
2443 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2444                        struct btrfs_root *root, struct btrfs_path *p,
2445                        struct extent_buffer *b, int level, int ins_len,
2446                        int *write_lock_level)
2447 {
2448         struct btrfs_fs_info *fs_info = root->fs_info;
2449         int ret;
2450
2451         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2452             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2453                 int sret;
2454
2455                 if (*write_lock_level < level + 1) {
2456                         *write_lock_level = level + 1;
2457                         btrfs_release_path(p);
2458                         goto again;
2459                 }
2460
2461                 btrfs_set_path_blocking(p);
2462                 reada_for_balance(fs_info, p, level);
2463                 sret = split_node(trans, root, p, level);
2464
2465                 BUG_ON(sret > 0);
2466                 if (sret) {
2467                         ret = sret;
2468                         goto done;
2469                 }
2470                 b = p->nodes[level];
2471         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2472                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2473                 int sret;
2474
2475                 if (*write_lock_level < level + 1) {
2476                         *write_lock_level = level + 1;
2477                         btrfs_release_path(p);
2478                         goto again;
2479                 }
2480
2481                 btrfs_set_path_blocking(p);
2482                 reada_for_balance(fs_info, p, level);
2483                 sret = balance_level(trans, root, p, level);
2484
2485                 if (sret) {
2486                         ret = sret;
2487                         goto done;
2488                 }
2489                 b = p->nodes[level];
2490                 if (!b) {
2491                         btrfs_release_path(p);
2492                         goto again;
2493                 }
2494                 BUG_ON(btrfs_header_nritems(b) == 1);
2495         }
2496         return 0;
2497
2498 again:
2499         ret = -EAGAIN;
2500 done:
2501         return ret;
2502 }
2503
2504 static void key_search_validate(struct extent_buffer *b,
2505                                 const struct btrfs_key *key,
2506                                 int level)
2507 {
2508 #ifdef CONFIG_BTRFS_ASSERT
2509         struct btrfs_disk_key disk_key;
2510
2511         btrfs_cpu_key_to_disk(&disk_key, key);
2512
2513         if (level == 0)
2514                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2515                     offsetof(struct btrfs_leaf, items[0].key),
2516                     sizeof(disk_key)));
2517         else
2518                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2519                     offsetof(struct btrfs_node, ptrs[0].key),
2520                     sizeof(disk_key)));
2521 #endif
2522 }
2523
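/*
 * Thin wrapper around btrfs_bin_search that exploits a property of the
 * descent: once a level has seen an exact match (*prev_cmp == 0), every
 * lower level must also match at slot 0, because a child's first key
 * equals the key pointer in its parent.  The binary search can then be
 * skipped, and the invariant is only checked under CONFIG_BTRFS_ASSERT.
 */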
2524 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2525                       int level, int *prev_cmp, int *slot)
2526 {
2527         if (*prev_cmp != 0) {
2528                 *prev_cmp = btrfs_bin_search(b, key, level, slot);
2529                 return *prev_cmp;
2530         }
2531
2532         key_search_validate(b, key, level);
2533         *slot = 0;
2534
2535         return 0;
2536 }
2537
2538 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2539                 u64 iobjectid, u64 ioff, u8 key_type,
2540                 struct btrfs_key *found_key)
2541 {
2542         int ret;
2543         struct btrfs_key key;
2544         struct extent_buffer *eb;
2545
2546         ASSERT(path);
2547         ASSERT(found_key);
2548
2549         key.type = key_type;
2550         key.objectid = iobjectid;
2551         key.offset = ioff;
2552
2553         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2554         if (ret < 0)
2555                 return ret;
2556
2557         eb = path->nodes[0];
2558         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2559                 ret = btrfs_next_leaf(fs_root, path);
2560                 if (ret)
2561                         return ret;
2562                 eb = path->nodes[0];
2563         }
2564
2565         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2566         if (found_key->type != key.type ||
2567                         found_key->objectid != key.objectid)
2568                 return 1;
2569
2570         return 0;
2571 }
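
/*
 * Hypothetical usage sketch (not part of the original file): the
 * btrfs_search_slot calling convention that btrfs_find_item above wraps.
 * The helper name and the key chosen here are illustrative only.
 */
#if 0
static int example_search(struct btrfs_root *root, u64 ino)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = ino;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* read-only lookup: no transaction handle, ins_len 0, cow 0 */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret == 0) {
                /* exact match at path->nodes[0], slot path->slots[0] */
        } else if (ret == 1) {
                /* not found; the slot is the would-be insert position */
        }
        btrfs_free_path(path);
        return ret;
}
#endif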
2572
2573 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2574                                                         struct btrfs_path *p,
2575                                                         int write_lock_level)
2576 {
2577         struct btrfs_fs_info *fs_info = root->fs_info;
2578         struct extent_buffer *b;
2579         int root_lock;
2580         int level = 0;
2581
2582         /* We try very hard to do read locks on the root */
2583         root_lock = BTRFS_READ_LOCK;
2584
2585         if (p->search_commit_root) {
2586                 /* The commit roots are read only so we always do read locks */
2587                 if (p->need_commit_sem)
2588                         down_read(&fs_info->commit_root_sem);
2589                 b = root->commit_root;
2590                 extent_buffer_get(b);
2591                 level = btrfs_header_level(b);
2592                 if (p->need_commit_sem)
2593                         up_read(&fs_info->commit_root_sem);
2594                 /*
2595                  * Ensure that all callers have set skip_locking when
2596                  * p->search_commit_root = 1.
2597                  */
2598                 ASSERT(p->skip_locking == 1);
2599
2600                 goto out;
2601         }
2602
2603         if (p->skip_locking) {
2604                 b = btrfs_root_node(root);
2605                 level = btrfs_header_level(b);
2606                 goto out;
2607         }
2608
2609         /*
2610          * If write_lock_level is at the maximum, the caller wants every level
2611          * write locked, so we can skip trying to get the read lock.
2612          */
2613         if (write_lock_level < BTRFS_MAX_LEVEL) {
2614                 /*
2615                  * We don't know the level of the root node until we actually
2616                  * have it read locked
2617                  */
2618                 b = btrfs_read_lock_root_node(root);
2619                 level = btrfs_header_level(b);
2620                 if (level > write_lock_level)
2621                         goto out;
2622
2623                 /* Whoops, must trade for write lock */
2624                 btrfs_tree_read_unlock(b);
2625                 free_extent_buffer(b);
2626         }
2627
2628         b = btrfs_lock_root_node(root);
2629         root_lock = BTRFS_WRITE_LOCK;
2630
2631         /* The level might have changed, check again */
2632         level = btrfs_header_level(b);
2633
2634 out:
2635         p->nodes[level] = b;
2636         if (!p->skip_locking)
2637                 p->locks[level] = root_lock;
2638         /*
2639          * Callers are responsible for dropping b's references.
2640          */
2641         return b;
2642 }
2643
2644
2645 /*
2646  * btrfs_search_slot - look for a key in a tree and perform necessary
2647  * modifications to preserve tree invariants.
2648  *
2649  * @trans:      Handle of transaction, used when modifying the tree
2650  * @p:          Holds all btree nodes along the search path
2651  * @root:       The root of the tree we are searching
2652  * @key:        The key we are looking for
2653  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2654  *              deletions it's -1. 0 for plain searches
2655  * @cow:        boolean indicating whether CoW operations should be
2656  *              performed. Must always be 1 when modifying the tree.
2657  *
2658  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2659  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2660  *
2661  * If @key is found, 0 is returned and you can find the item in the leaf level
2662  * of the path (level 0)
2663  *
2664  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2665  * points to the slot where it should be inserted
2666  *
2667  * If an error is encountered while searching the tree a negative error number
2668  * is returned
2669  */
2670 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2671                       const struct btrfs_key *key, struct btrfs_path *p,
2672                       int ins_len, int cow)
2673 {
2674         struct btrfs_fs_info *fs_info = root->fs_info;
2675         struct extent_buffer *b;
2676         int slot;
2677         int ret;
2678         int err;
2679         int level;
2680         int lowest_unlock = 1;
2681         /* everything at write_lock_level or lower must be write locked */
2682         int write_lock_level = 0;
2683         u8 lowest_level = 0;
2684         int min_write_lock_level;
2685         int prev_cmp;
2686
2687         lowest_level = p->lowest_level;
2688         WARN_ON(lowest_level && ins_len > 0);
2689         WARN_ON(p->nodes[0] != NULL);
2690         BUG_ON(!cow && ins_len);
2691
2692         if (ins_len < 0) {
2693                 lowest_unlock = 2;
2694
2695                 /* when we are removing items, we might have to go up to level
2696                  * two as we update tree pointers.  Make sure we keep write
2697                  * locks on those levels as well.
2698                  */
2699                 write_lock_level = 2;
2700         } else if (ins_len > 0) {
2701                 /*
2702                  * for inserting items, make sure we have a write lock on
2703                  * level 1 so we can update keys
2704                  */
2705                 write_lock_level = 1;
2706         }
2707
2708         if (!cow)
2709                 write_lock_level = -1;
2710
2711         if (cow && (p->keep_locks || p->lowest_level))
2712                 write_lock_level = BTRFS_MAX_LEVEL;
2713
2714         min_write_lock_level = write_lock_level;
2715
2716 again:
2717         prev_cmp = -1;
2718         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2719
2720         while (b) {
2721                 level = btrfs_header_level(b);
2722
2723                 /*
2724                  * setup the path here so we can release it under lock
2725                  * contention with the cow code
2726                  */
2727                 if (cow) {
2728                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2729
2730                         /*
2731                          * if we don't really need to cow this block
2732                          * then we don't want to set the path blocking,
2733                          * so we test it here
2734                          */
2735                         if (!should_cow_block(trans, root, b)) {
2736                                 trans->dirty = true;
2737                                 goto cow_done;
2738                         }
2739
2740                         /*
2741                          * must have write locks on this node and the
2742                          * parent
2743                          */
2744                         if (level > write_lock_level ||
2745                             (level + 1 > write_lock_level &&
2746                             level + 1 < BTRFS_MAX_LEVEL &&
2747                             p->nodes[level + 1])) {
2748                                 write_lock_level = level + 1;
2749                                 btrfs_release_path(p);
2750                                 goto again;
2751                         }
2752
2753                         btrfs_set_path_blocking(p);
2754                         if (last_level)
2755                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2756                                                       &b);
2757                         else
2758                                 err = btrfs_cow_block(trans, root, b,
2759                                                       p->nodes[level + 1],
2760                                                       p->slots[level + 1], &b);
2761                         if (err) {
2762                                 ret = err;
2763                                 goto done;
2764                         }
2765                 }
2766 cow_done:
2767                 p->nodes[level] = b;
2768                 /*
2769                  * Leave the path with blocking locks to avoid massive
2770                  * lock context switching; this is done on purpose.
2771                  */
2772
2773                 /*
2774                  * we have a lock on b and as long as we aren't changing
2775                  * the tree, there is no way for the items in b to change.
2776                  * It is safe to drop the lock on our parent before we
2777                  * go through the expensive btree search on b.
2778                  *
2779                  * If we're inserting or deleting (ins_len != 0), then we might
2780                  * be changing slot zero, which may require changing the parent.
2781                  * So, we can't drop the lock until after we know which slot
2782                  * we're operating on.
2783                  */
                if (!ins_len && !p->keep_locks) {
                        int u = level + 1;

                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
                                p->locks[u] = 0;
                        }
                }

                ret = key_search(b, key, level, &prev_cmp, &slot);
                if (ret < 0)
                        goto done;

                if (level != 0) {
                        int dec = 0;
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        err = setup_nodes_for_search(trans, root, p, b, level,
                                             ins_len, &write_lock_level);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }
                        b = p->nodes[level];
                        slot = p->slots[level];
                        /*
                         * Slot 0 is special: if we change the key we
                         * have to update the parent pointer, which
                         * means we must have a write lock on the
                         * parent.
                         */
                        if (slot == 0 && ins_len &&
                            write_lock_level < level + 1) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }
                        unlock_up(p, level, lowest_unlock,
                                  min_write_lock_level, &write_lock_level);

                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(root, p, &b, level,
                                                    slot, key);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        if (!p->skip_locking) {
                                level = btrfs_header_level(b);
                                if (level <= write_lock_level) {
                                        /*
                                         * The try-lock returns 0 when it
                                         * cannot take the lock atomically;
                                         * fall back to a blocking write lock.
                                         */
                                        err = btrfs_try_tree_write_lock(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_lock(b);
                                        }
                                        p->locks[level] = BTRFS_WRITE_LOCK;
                                } else {
                                        /* Same fallback for the read lock. */
                                        err = btrfs_tree_read_lock_atomic(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_read_lock(b);
                                        }
                                        p->locks[level] = BTRFS_READ_LOCK;
                                }
                                p->nodes[level] = b;
                        }
                } else {
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(fs_info, b) < ins_len) {
                                if (write_lock_level < 1) {
                                        write_lock_level = 1;
                                        btrfs_release_path(p);
                                        goto again;
                                }

                                btrfs_set_path_blocking(p);
                                err = split_leaf(trans, root, key,
                                                 p, ins_len, ret == 0);

                                BUG_ON(err > 0);
                                if (err) {
                                        ret = err;
                                        goto done;
                                }
                        }
                        if (!p->search_for_split)
                                unlock_up(p, level, lowest_unlock,
                                          min_write_lock_level, NULL);
                        goto done;
                }
        }
        ret = 1;
done:
        /*
         * we don't really know what they plan on doing with the path
         * from here on, so for now just mark it as blocking
         */
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
        return ret;
}
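
/*
 * Usage sketch: a minimal read-only lookup with btrfs_search_slot().  The
 * helper below is hypothetical (nothing in this file calls it); it only
 * illustrates the calling convention in-tree callers use: trans == NULL and
 * cow == 0 for a search that does not modify the tree, and ins_len == 0
 * because nothing is inserted.
 */
static int __maybe_unused example_lookup_item(struct btrfs_root *root,
                                              u64 objectid, u8 type,
                                              u64 offset)
{
        struct btrfs_key key;
        struct btrfs_path *path;
        int ret;

        key.objectid = objectid;
        key.type = type;
        key.offset = offset;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Read-only search: no transaction handle, no cow, no insertion. */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret > 0)
                ret = -ENOENT;  /* exact key not present */

        /* On success the item is at path->nodes[0], path->slots[0]. */
        btrfs_free_path(path);
        return ret;
}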

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
        int prev_cmp = -1;

        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);

        if (p->search_commit_root) {
                BUG_ON(time_seq);
                return btrfs_search_slot(NULL, root, key, p, 0, 0);
        }

again:
        b = get_old_root(root, time_seq);
        if (!b) {
                ret = -EIO;
                goto done;
        }
        level = btrfs_header_level(b);
        p->locks[level] = BTRFS_READ_LOCK;

        while (b) {
                level = btrfs_header_level(b);
                p->nodes[level] = b;

                /*
                 * We have a lock on b and as long as we aren't changing
                 * the tree, there is no way for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 */
                btrfs_unlock_up_safe(p, level + 1);

                /*
                 * Since we can unwind extent buffers we want to do a real
                 * search every time.
                 */
                prev_cmp = -1;
                ret = key_search(b, key, level, &prev_cmp, &slot);

                if (level != 0) {
                        int dec = 0;
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);

                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(root, p, &b, level,
                                                    slot, key);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        level = btrfs_header_level(b);
                        err = btrfs_tree_read_lock_atomic(b);
                        if (!err) {
                                btrfs_set_path_blocking(p);
                                btrfs_tree_read_lock(b);
                        }
                        b = tree_mod_log_rewind(fs_info, p, b, time_seq);
                        if (!b) {
                                ret = -ENOMEM;
                                goto done;
                        }
                        p->locks[level] = BTRFS_READ_LOCK;
                        p->nodes[level] = b;
                } else {
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);
                        goto done;
                }
        }
        ret = 1;
done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
                btrfs_release_path(p);

        return ret;
}
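
/*
 * Usage sketch: searching an old version of a tree.  The helper below is
 * hypothetical (not part of this file); it shows the pattern used by real
 * callers such as the backref walking code: pin a tree mod log sequence
 * number first so the rewind records stay available, search, then drop the
 * sequence number again.
 */
static int __maybe_unused example_search_old(struct btrfs_root *root,
                                             const struct btrfs_key *key,
                                             struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct seq_list elem = SEQ_LIST_INIT(elem);
        int ret;

        /* Pin the current tree mod log sequence number. */
        btrfs_get_tree_mod_seq(fs_info, &elem);
        ret = btrfs_search_old_slot(root, key, path, elem.seq);
        btrfs_put_tree_mod_seq(fs_info, &elem);

        return ret;
}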

/*
 * Helper to use instead of btrfs_search_slot() when no exact match is needed
 * and the next or previous item should be returned instead.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error.
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               const struct btrfs_key *key,
                               struct btrfs_path *p, int find_higher,
                               int return_any)
{
        int ret;
        struct extent_buffer *leaf;

again:
        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
        if (ret <= 0)
                return ret;
        /*
         * a return value of 1 means the path is at the position where the
         * item should be inserted. Normally this is the next bigger item,
         * but in case the previous item is the last in a leaf, path points
         * to the first free slot in the previous leaf, i.e. at an invalid
         * item.
         */
        leaf = p->nodes[0];

        if (find_higher) {
                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, p);
                        if (ret <= 0)
                                return ret;
                        if (!return_any)
                                return 1;
                        /*
                         * no higher item found, return the next
                         * lower instead
                         */
                        return_any = 0;
                        find_higher = 0;
                        btrfs_release_path(p);
                        goto again;
                }
        } else {
                if (p->slots[0] == 0) {
                        ret = btrfs_prev_leaf(root, p);
                        if (ret < 0)
                                return ret;
                        if (!ret) {
                                leaf = p->nodes[0];
                                if (p->slots[0] == btrfs_header_nritems(leaf))
                                        p->slots[0]--;
                                return 0;
                        }
                        if (!return_any)
                                return 1;
                        /*
                         * no lower item found, return the next
                         * higher instead
                         */
                        return_any = 0;
                        find_higher = 1;
                        btrfs_release_path(p);
                        goto again;
                } else {
                        --p->slots[0];
                }
        }
        return 0;
}
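
/*
 * Usage sketch: nearest-item lookup.  The helper below is hypothetical (not
 * part of this file); it positions the path at the first item with a key
 * greater than or equal to @key, falling back to the next lower item when
 * nothing higher exists (find_higher == 1, return_any == 1), then reads the
 * key that was actually found.
 */
static int __maybe_unused example_find_nearest(struct btrfs_root *root,
                                               const struct btrfs_key *key,
                                               struct btrfs_path *path,
                                               struct btrfs_key *found)
{
        int ret;

        ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
        if (ret)
                return ret;     /* 1: tree is empty, < 0: error */

        btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
        return 0;
}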

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 */
static void fixup_low_keys(struct btrfs_path *path,
                           struct btrfs_disk_key *key, int level)
{
        int i;
        struct extent_buffer *t;
        int ret;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                int tslot = path->slots[i];

                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
                                GFP_ATOMIC);
                BUG_ON(ret < 0);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
                        break;
        }
}
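
/*
 * Worked example for the loop above: when the first key of a leaf changes,
 * the parent's pointer to that leaf must be updated to the new key.  If the
 * leaf sits in slot 0 of its parent, the grandparent's pointer carries the
 * same key and needs the same fix, and so on up the tree.  The walk stops at
 * the first level where the child is not in slot 0, because the separator
 * keys above it are unaffected.
 */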

/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * to make sure the new key won't break the key ordering.
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
                             struct btrfs_path *path,
                             const struct btrfs_key *new_key)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *eb;
        int slot;

        eb = path->nodes[0];
        slot = path->slots[0];
        if (slot > 0) {
                /* The new key must still sort above its left neighbour. */
                btrfs_item_key(eb, &disk_key, slot - 1);
                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
        }
        if (slot < btrfs_header_nritems(eb) - 1) {
                /* ... and below its right neighbour. */
                btrfs_item_key(eb, &disk_key, slot + 1);
                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(eb, &disk_key, slot);
        btrfs_mark_buffer_dirty(eb);
        if (slot == 0)
                fixup_low_keys(path, &disk_key, 1);
}
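
/*
 * Usage sketch: the helper below is hypothetical (not part of this file).
 * It mirrors how real callers (e.g. the file extent punch-hole code) use
 * btrfs_set_item_key_safe(): keep the item in place but move its key, here
 * by changing only the offset of the item the path currently points to.
 */
static void __maybe_unused example_change_key_offset(struct btrfs_fs_info *fs_info,
                                                     struct btrfs_path *path,
                                                     u64 new_offset)
{
        struct btrfs_key new_key;

        btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
        new_key.offset = new_offset;
        btrfs_set_item_key_safe(fs_info, path, &new_key);
}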