btrfs: Remove extraneous extent_buffer_get from tree_mod_log_rewind
fs/btrfs/ctree.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
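
/*
 * Example usage (an illustrative sketch, not code used in this file;
 * "root" and "key" stand in for a caller's tree root and search key):
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	... examine path->nodes[0] at path->slots[0] ...
 *	btrfs_free_path(path);
 *
 * btrfs_free_path() drops any locks and extent buffer references still
 * held, via btrfs_release_path(), before freeing the path itself.
 */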

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers
 * held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of the
 * tree until you end up with a read lock on the root.  A read locked
 * buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* Cow-only roots (everything that is not a reference counted cow
 * subvolume) just get put onto a simple dirty list.  transaction.c walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* these are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
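
/*
 * For example, moving 3 key pointers from slot 5 to slot 2 of a node is
 * recorded as op == MOD_LOG_MOVE_KEYS with slot == 5, move.dst_slot == 2
 * and move.nr_items == 3, while removing the pointer in slot 4 is recorded
 * as op == MOD_LOG_KEY_REMOVE with slot == 4 plus the removed key, blockptr
 * and generation, so a rewind can restore the old contents.
 */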

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should make sure elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
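
/*
 * Example usage (an illustrative sketch): code that needs a stable view of
 * past tree states, e.g. backref walking, brackets its work with a blocker
 * element so the tree mod log entries it depends on are not freed:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... use elem.seq as the time_seq for tree mod log queries ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */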

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
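
/*
 * Note: the comparisons above keep the tree ordered by (logical, seq)
 * descending from left to right, which is why __tree_mod_log_search()
 * below walks left when looking for entries with higher sequence numbers.
 */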

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

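/*
 * Allocate and fill a tree mod log element for a key operation on @slot of
 * @eb. The current key and block pointer are captured for everything but
 * MOD_LOG_KEY_ADD, where the slot has no old contents worth preserving.
 */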
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

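/*
 * Record a single key add/remove/replace for @slot of @eb in the tree mod
 * log. Returns 0 on success or when logging is not needed, a negative
 * errno otherwise.
 */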
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

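/*
 * Record a move of @nr_items key pointers from @src_slot to @dst_slot in
 * @eb. Pointers that a move towards the front of the buffer overwrites are
 * logged as removals first, so a rewind can restore them.
 */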
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

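/*
 * Record a copy of @nr_items key pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset), logged as paired
 * remove-from-source and add-to-destination operations.
 */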
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

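/*
 * Record the removal of all key pointers of @eb, used when the buffer is
 * about to be freed so that a rewind can still reconstruct its contents.
 */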
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed on blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	/*
	 * If we are COWing a node/leaf from the extent, chunk or device trees,
	 * make sure that we do not finish block group creation of pending block
	 * groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk and device trees, therefore we could
	 * deadlock with ourselves since we are holding a lock on an extent
	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
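			/*
			 * undo the move by copying the key pointers back
			 * from dst_slot to the original slot
			 */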
1213                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1214                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1215                         memmove_extent_buffer(eb, o_dst, o_src,
1216                                               tm->move.nr_items * p_size);
1217                         break;
1218                 case MOD_LOG_ROOT_REPLACE:
1219                         /*
1220                          * this operation is special. for roots, this must be
1221                          * handled explicitly before rewinding.
1222                          * for non-roots, this operation may exist if the node
1223                          * was a root: root A -> child B; then A gets empty and
1224                          * B is promoted to the new root. in the mod log, we'll
1225                          * have a root-replace operation for B, a tree block
1226                          * that is no root. we simply ignore that operation.
1227                          */
1228                         break;
1229                 }
1230                 next = rb_next(&tm->node);
1231                 if (!next)
1232                         break;
1233                 tm = rb_entry(next, struct tree_mod_elem, node);
1234                 if (tm->logical != first_tm->logical)
1235                         break;
1236         }
1237         read_unlock(&fs_info->tree_mod_log_lock);
1238         btrfs_set_header_nritems(eb, n);
1239 }
1240
1241 /*
1242  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1243  * is returned. If rewind operations happen, a fresh buffer is returned. The
1244  * returned buffer is always read-locked. If the returned buffer is not the
1245  * input buffer, the lock on the input buffer is released and the input buffer
1246  * is freed (its refcount is decremented).
1247  */
1248 static struct extent_buffer *
1249 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1250                     struct extent_buffer *eb, u64 time_seq)
1251 {
1252         struct extent_buffer *eb_rewin;
1253         struct tree_mod_elem *tm;
1254
1255         if (!time_seq)
1256                 return eb;
1257
1258         if (btrfs_header_level(eb) == 0)
1259                 return eb;
1260
1261         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1262         if (!tm)
1263                 return eb;
1264
1265         btrfs_set_path_blocking(path);
1266         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1267
1268         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1269                 BUG_ON(tm->slot != 0);
1270                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1271                 if (!eb_rewin) {
1272                         btrfs_tree_read_unlock_blocking(eb);
1273                         free_extent_buffer(eb);
1274                         return NULL;
1275                 }
1276                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1277                 btrfs_set_header_backref_rev(eb_rewin,
1278                                              btrfs_header_backref_rev(eb));
1279                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1280                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1281         } else {
1282                 eb_rewin = btrfs_clone_extent_buffer(eb);
1283                 if (!eb_rewin) {
1284                         btrfs_tree_read_unlock_blocking(eb);
1285                         free_extent_buffer(eb);
1286                         return NULL;
1287                 }
1288         }
1289
1290         btrfs_tree_read_unlock_blocking(eb);
1291         free_extent_buffer(eb);
1292
1293         btrfs_tree_read_lock(eb_rewin);
1294         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1295         WARN_ON(btrfs_header_nritems(eb_rewin) >
1296                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1297
1298         return eb_rewin;
1299 }
1300
1301 /*
1302  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1303  * value. If there are no changes, the current root->root_node is returned. If
1304  * anything changed in between, there's a fresh buffer allocated on which the
1305  * rewind operations are done. In any case, the returned buffer is read locked.
1306  * Returns NULL on error (with no locks held).
1307  */
1308 static inline struct extent_buffer *
1309 get_old_root(struct btrfs_root *root, u64 time_seq)
1310 {
1311         struct btrfs_fs_info *fs_info = root->fs_info;
1312         struct tree_mod_elem *tm;
1313         struct extent_buffer *eb = NULL;
1314         struct extent_buffer *eb_root;
1315         struct extent_buffer *old;
1316         struct tree_mod_root *old_root = NULL;
1317         u64 old_generation = 0;
1318         u64 logical;
1319         int level;
1320
1321         eb_root = btrfs_read_lock_root_node(root);
1322         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1323         if (!tm)
1324                 return eb_root;
1325
1326         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1327                 old_root = &tm->old_root;
1328                 old_generation = tm->generation;
1329                 logical = old_root->logical;
1330                 level = old_root->level;
1331         } else {
1332                 logical = eb_root->start;
1333                 level = btrfs_header_level(eb_root);
1334         }
1335
1336         tm = tree_mod_log_search(fs_info, logical, time_seq);
1337         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1338                 btrfs_tree_read_unlock(eb_root);
1339                 free_extent_buffer(eb_root);
1340                 old = read_tree_block(fs_info, logical, 0, level, NULL);
1341                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1342                         if (!IS_ERR(old))
1343                                 free_extent_buffer(old);
1344                         btrfs_warn(fs_info,
1345                                    "failed to read tree block %llu from get_old_root",
1346                                    logical);
1347                 } else {
1348                         eb = btrfs_clone_extent_buffer(old);
1349                         free_extent_buffer(old);
1350                 }
1351         } else if (old_root) {
1352                 btrfs_tree_read_unlock(eb_root);
1353                 free_extent_buffer(eb_root);
1354                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1355         } else {
1356                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1357                 eb = btrfs_clone_extent_buffer(eb_root);
1358                 btrfs_tree_read_unlock_blocking(eb_root);
1359                 free_extent_buffer(eb_root);
1360         }
1361
1362         if (!eb)
1363                 return NULL;
1364         btrfs_tree_read_lock(eb);
1365         if (old_root) {
1366                 btrfs_set_header_bytenr(eb, eb->start);
1367                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1368                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1369                 btrfs_set_header_level(eb, old_root->level);
1370                 btrfs_set_header_generation(eb, old_generation);
1371         }
1372         if (tm)
1373                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1374         else
1375                 WARN_ON(btrfs_header_level(eb) != 0);
1376         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1377
1378         return eb;
1379 }
1380
1381 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1382 {
1383         struct tree_mod_elem *tm;
1384         int level;
1385         struct extent_buffer *eb_root = btrfs_root_node(root);
1386
1387         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1388         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1389                 level = tm->old_root.level;
1390         } else {
1391                 level = btrfs_header_level(eb_root);
1392         }
1393         free_extent_buffer(eb_root);
1394
1395         return level;
1396 }
1397
1398 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1399                                    struct btrfs_root *root,
1400                                    struct extent_buffer *buf)
1401 {
1402         if (btrfs_is_testing(root->fs_info))
1403                 return 0;
1404
1405         /* Ensure we can see the FORCE_COW bit */
1406         smp_mb__before_atomic();
1407
1408         /*
1409          * We do not need to cow a block if
1410          * 1) this block is not created or changed in this transaction;
1411          * 2) this block does not belong to TREE_RELOC tree;
1412          * 3) the root is not forced COW.
1413          *
1414          * What is forced COW:
1415          *    when we create snapshot during committing the transaction,
1416          *    after we've finished coping src root, we must COW the shared
1417          *    block to ensure the metadata consistency.
1418          */
1419         if (btrfs_header_generation(buf) == trans->transid &&
1420             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1421             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1422               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1423             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1424                 return 0;
1425         return 1;
1426 }
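/*
 * Illustrative note (not part of the original file): a block that this
 * transaction itself allocated (header generation == trans->transid),
 * that has not yet been written back and carries no RELOC flag, can be
 * modified in place: should_cow_block() returns 0 and btrfs_cow_block()
 * below simply hands the caller back the original buffer.
 */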
1427
1428 /*
1429  * cows a single block, see __btrfs_cow_block for the real work.
1430  * This version of it has extra checks so that a block isn't COWed more than
1431  * once per transaction, as long as it hasn't been written yet
1432  */
1433 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1434                     struct btrfs_root *root, struct extent_buffer *buf,
1435                     struct extent_buffer *parent, int parent_slot,
1436                     struct extent_buffer **cow_ret)
1437 {
1438         struct btrfs_fs_info *fs_info = root->fs_info;
1439         u64 search_start;
1440         int ret;
1441
1442         if (trans->transaction != fs_info->running_transaction)
1443                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1444                        trans->transid,
1445                        fs_info->running_transaction->transid);
1446
1447         if (trans->transid != fs_info->generation)
1448                 WARN(1, KERN_CRIT "trans %llu fs generation %llu\n",
1449                        trans->transid, fs_info->generation);
1450
1451         if (!should_cow_block(trans, root, buf)) {
1452                 trans->dirty = true;
1453                 *cow_ret = buf;
1454                 return 0;
1455         }
1456
1457         search_start = buf->start & ~((u64)SZ_1G - 1);
1458
1459         if (parent)
1460                 btrfs_set_lock_blocking(parent);
1461         btrfs_set_lock_blocking(buf);
1462
1463         ret = __btrfs_cow_block(trans, root, buf, parent,
1464                                  parent_slot, cow_ret, search_start, 0);
1465
1466         trace_btrfs_cow_block(root, buf, *cow_ret);
1467
1468         return ret;
1469 }
1470
1471 /*
1472  * helper function for defrag to decide if two blocks pointed to by a
1473  * node are actually close by
1474  */
1475 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1476 {
1477         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1478                 return 1;
1479         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1480                 return 1;
1481         return 0;
1482 }
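/*
 * Illustrative numbers (not part of the original file), assuming a 16K
 * nodesize: close_blocks(0, 40960, 16384) sees a forward gap of
 * 40960 - 16384 = 24576 bytes, under the 32K threshold, so it returns 1;
 * close_blocks(0, 65536, 16384) sees a gap of 49152 bytes and returns 0,
 * which makes the defrag code below relocate the block.
 */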
1483
1484 /*
1485  * compare two keys in a memcmp fashion
1486  */
1487 static int comp_keys(const struct btrfs_disk_key *disk,
1488                      const struct btrfs_key *k2)
1489 {
1490         struct btrfs_key k1;
1491
1492         btrfs_disk_key_to_cpu(&k1, disk);
1493
1494         return btrfs_comp_cpu_keys(&k1, k2);
1495 }
1496
1497 /*
1498  * same as comp_keys only with two btrfs_key's
1499  */
1500 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1501 {
1502         if (k1->objectid > k2->objectid)
1503                 return 1;
1504         if (k1->objectid < k2->objectid)
1505                 return -1;
1506         if (k1->type > k2->type)
1507                 return 1;
1508         if (k1->type < k2->type)
1509                 return -1;
1510         if (k1->offset > k2->offset)
1511                 return 1;
1512         if (k1->offset < k2->offset)
1513                 return -1;
1514         return 0;
1515 }
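/*
 * Illustrative example (not part of the original file): keys order
 * lexicographically by (objectid, type, offset).  Given
 *
 *      struct btrfs_key a = { .objectid = 256,
 *                             .type = BTRFS_INODE_ITEM_KEY, .offset = 0 };
 *      struct btrfs_key b = { .objectid = 256,
 *                             .type = BTRFS_EXTENT_DATA_KEY, .offset = 0 };
 *
 * btrfs_comp_cpu_keys(&a, &b) returns -1: the objectids tie, and
 * BTRFS_INODE_ITEM_KEY (1) sorts before BTRFS_EXTENT_DATA_KEY (108).
 */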
1516
1517 /*
1518  * this is used by the defrag code to go through all the
1519  * leaves pointed to by a node and reallocate them so that
1520  * disk order is close to key order
1521  */
1522 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1523                        struct btrfs_root *root, struct extent_buffer *parent,
1524                        int start_slot, u64 *last_ret,
1525                        struct btrfs_key *progress)
1526 {
1527         struct btrfs_fs_info *fs_info = root->fs_info;
1528         struct extent_buffer *cur;
1529         u64 blocknr;
1530         u64 gen;
1531         u64 search_start = *last_ret;
1532         u64 last_block = 0;
1533         u64 other;
1534         u32 parent_nritems;
1535         int end_slot;
1536         int i;
1537         int err = 0;
1538         int parent_level;
1539         int uptodate;
1540         u32 blocksize;
1541         int progress_passed = 0;
1542         struct btrfs_disk_key disk_key;
1543
1544         parent_level = btrfs_header_level(parent);
1545
1546         WARN_ON(trans->transaction != fs_info->running_transaction);
1547         WARN_ON(trans->transid != fs_info->generation);
1548
1549         parent_nritems = btrfs_header_nritems(parent);
1550         blocksize = fs_info->nodesize;
1551         end_slot = parent_nritems - 1;
1552
1553         if (parent_nritems <= 1)
1554                 return 0;
1555
1556         btrfs_set_lock_blocking(parent);
1557
1558         for (i = start_slot; i <= end_slot; i++) {
1559                 struct btrfs_key first_key;
1560                 int close = 1;
1561
1562                 btrfs_node_key(parent, &disk_key, i);
1563                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1564                         continue;
1565
1566                 progress_passed = 1;
1567                 blocknr = btrfs_node_blockptr(parent, i);
1568                 gen = btrfs_node_ptr_generation(parent, i);
1569                 btrfs_node_key_to_cpu(parent, &first_key, i);
1570                 if (last_block == 0)
1571                         last_block = blocknr;
1572
1573                 if (i > 0) {
1574                         other = btrfs_node_blockptr(parent, i - 1);
1575                         close = close_blocks(blocknr, other, blocksize);
1576                 }
1577                 if (!close && i < end_slot) {
1578                         other = btrfs_node_blockptr(parent, i + 1);
1579                         close = close_blocks(blocknr, other, blocksize);
1580                 }
1581                 if (close) {
1582                         last_block = blocknr;
1583                         continue;
1584                 }
1585
1586                 cur = find_extent_buffer(fs_info, blocknr);
1587                 if (cur)
1588                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1589                 else
1590                         uptodate = 0;
1591                 if (!cur || !uptodate) {
1592                         if (!cur) {
1593                                 cur = read_tree_block(fs_info, blocknr, gen,
1594                                                       parent_level - 1,
1595                                                       &first_key);
1596                                 if (IS_ERR(cur)) {
1597                                         return PTR_ERR(cur);
1598                                 } else if (!extent_buffer_uptodate(cur)) {
1599                                         free_extent_buffer(cur);
1600                                         return -EIO;
1601                                 }
1602                         } else if (!uptodate) {
1603                                 err = btrfs_read_buffer(cur, gen,
1604                                                 parent_level - 1, &first_key);
1605                                 if (err) {
1606                                         free_extent_buffer(cur);
1607                                         return err;
1608                                 }
1609                         }
1610                 }
1611                 if (search_start == 0)
1612                         search_start = last_block;
1613
1614                 btrfs_tree_lock(cur);
1615                 btrfs_set_lock_blocking(cur);
1616                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1617                                         &cur, search_start,
1618                                         min(16 * blocksize,
1619                                             (end_slot - i) * blocksize));
1620                 if (err) {
1621                         btrfs_tree_unlock(cur);
1622                         free_extent_buffer(cur);
1623                         break;
1624                 }
1625                 search_start = cur->start;
1626                 last_block = cur->start;
1627                 *last_ret = search_start;
1628                 btrfs_tree_unlock(cur);
1629                 free_extent_buffer(cur);
1630         }
1631         return err;
1632 }
1633
1634 /*
1635  * search for key in the extent_buffer.  The items start at offset p,
1636  * and they are item_size apart.  There are 'max' items in p.
1637  *
1638  * the slot in the array is returned via slot, and it points to
1639  * the place where you would insert key if it is not found in
1640  * the array.
1641  *
1642  * slot may point to max if the key is bigger than all of the keys
1643  */
1644 static noinline int generic_bin_search(struct extent_buffer *eb,
1645                                        unsigned long p, int item_size,
1646                                        const struct btrfs_key *key,
1647                                        int max, int *slot)
1648 {
1649         int low = 0;
1650         int high = max;
1651         int mid;
1652         int ret;
1653         struct btrfs_disk_key *tmp = NULL;
1654         struct btrfs_disk_key unaligned;
1655         unsigned long offset;
1656         char *kaddr = NULL;
1657         unsigned long map_start = 0;
1658         unsigned long map_len = 0;
1659         int err;
1660
1661         if (low > high) {
1662                 btrfs_err(eb->fs_info,
1663                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1664                           __func__, low, high, eb->start,
1665                           btrfs_header_owner(eb), btrfs_header_level(eb));
1666                 return -EINVAL;
1667         }
1668
1669         while (low < high) {
1670                 mid = (low + high) / 2;
1671                 offset = p + mid * item_size;
1672
1673                 if (!kaddr || offset < map_start ||
1674                     (offset + sizeof(struct btrfs_disk_key)) >
1675                     map_start + map_len) {
1676
1677                         err = map_private_extent_buffer(eb, offset,
1678                                                 sizeof(struct btrfs_disk_key),
1679                                                 &kaddr, &map_start, &map_len);
1680
1681                         if (!err) {
1682                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1683                                                         map_start);
1684                         } else if (err == 1) {
1685                                 read_extent_buffer(eb, &unaligned,
1686                                                    offset, sizeof(unaligned));
1687                                 tmp = &unaligned;
1688                         } else {
1689                                 return err;
1690                         }
1691
1692                 } else {
1693                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1694                                                         map_start);
1695                 }
1696                 ret = comp_keys(tmp, key);
1697
1698                 if (ret < 0)
1699                         low = mid + 1;
1700                 else if (ret > 0)
1701                         high = mid;
1702                 else {
1703                         *slot = mid;
1704                         return 0;
1705                 }
1706         }
1707         *slot = low;
1708         return 1;
1709 }
1710
1711 /*
1712  * simple bin_search frontend that does the right thing for
1713  * leaves vs nodes
1714  */
1715 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1716                      int level, int *slot)
1717 {
1718         if (level == 0)
1719                 return generic_bin_search(eb,
1720                                           offsetof(struct btrfs_leaf, items),
1721                                           sizeof(struct btrfs_item),
1722                                           key, btrfs_header_nritems(eb),
1723                                           slot);
1724         else
1725                 return generic_bin_search(eb,
1726                                           offsetof(struct btrfs_node, ptrs),
1727                                           sizeof(struct btrfs_key_ptr),
1728                                           key, btrfs_header_nritems(eb),
1729                                           slot);
1730 }
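/*
 * Illustrative sketch (not part of the original file): interpreting
 * btrfs_bin_search() results.  example_find_slot() is a made-up helper:
 * 0 means an exact match at *slot, 1 means the key is absent and *slot
 * is the insertion point (possibly nritems if the key sorts last).
 */
static int example_find_slot(struct extent_buffer *eb, int level,
                             const struct btrfs_key *key)
{
        int slot;
        int ret;

        ret = btrfs_bin_search(eb, key, level, &slot);
        if (ret < 0)
                return ret;     /* corrupted block */
        if (ret == 0)
                return slot;    /* exact match found */
        return -ENOENT;         /* not found; slot is the insert position */
}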
1731
1732 static void root_add_used(struct btrfs_root *root, u32 size)
1733 {
1734         spin_lock(&root->accounting_lock);
1735         btrfs_set_root_used(&root->root_item,
1736                             btrfs_root_used(&root->root_item) + size);
1737         spin_unlock(&root->accounting_lock);
1738 }
1739
1740 static void root_sub_used(struct btrfs_root *root, u32 size)
1741 {
1742         spin_lock(&root->accounting_lock);
1743         btrfs_set_root_used(&root->root_item,
1744                             btrfs_root_used(&root->root_item) - size);
1745         spin_unlock(&root->accounting_lock);
1746 }
1747
1748 /* given a node and slot number, this reads the block it points to.  The
1749  * extent buffer is returned with a reference taken (but unlocked).
1750  */
1751 static noinline struct extent_buffer *
1752 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1753                int slot)
1754 {
1755         int level = btrfs_header_level(parent);
1756         struct extent_buffer *eb;
1757         struct btrfs_key first_key;
1758
1759         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1760                 return ERR_PTR(-ENOENT);
1761
1762         BUG_ON(level == 0);
1763
1764         btrfs_node_key_to_cpu(parent, &first_key, slot);
1765         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1766                              btrfs_node_ptr_generation(parent, slot),
1767                              level - 1, &first_key);
1768         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1769                 free_extent_buffer(eb);
1770                 eb = ERR_PTR(-EIO);
1771         }
1772
1773         return eb;
1774 }
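/*
 * Illustrative sketch (not part of the original file): the ERR_PTR
 * convention read_node_slot() follows.  Callers must test IS_ERR()
 * before touching the buffer and drop the reference when done:
 *
 *      eb = read_node_slot(fs_info, parent, slot);
 *      if (IS_ERR(eb))
 *              return PTR_ERR(eb);
 *      ...use eb...
 *      free_extent_buffer(eb);
 */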
1775
1776 /*
1777  * node level balancing, used to make sure nodes are in proper order for
1778  * item deletion.  We balance from the top down, so we have to make sure
1779  * that a deletion won't leave a node completely empty later on.
1780  */
1781 static noinline int balance_level(struct btrfs_trans_handle *trans,
1782                          struct btrfs_root *root,
1783                          struct btrfs_path *path, int level)
1784 {
1785         struct btrfs_fs_info *fs_info = root->fs_info;
1786         struct extent_buffer *right = NULL;
1787         struct extent_buffer *mid;
1788         struct extent_buffer *left = NULL;
1789         struct extent_buffer *parent = NULL;
1790         int ret = 0;
1791         int wret;
1792         int pslot;
1793         int orig_slot = path->slots[level];
1794         u64 orig_ptr;
1795
1796         ASSERT(level > 0);
1797
1798         mid = path->nodes[level];
1799
1800         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1801                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1802         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1803
1804         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1805
1806         if (level < BTRFS_MAX_LEVEL - 1) {
1807                 parent = path->nodes[level + 1];
1808                 pslot = path->slots[level + 1];
1809         }
1810
1811         /*
1812          * deal with the case where there is only one pointer in the root
1813          * by promoting the node below to a root
1814          */
1815         if (!parent) {
1816                 struct extent_buffer *child;
1817
1818                 if (btrfs_header_nritems(mid) != 1)
1819                         return 0;
1820
1821                 /* promote the child to a root */
1822                 child = read_node_slot(fs_info, mid, 0);
1823                 if (IS_ERR(child)) {
1824                         ret = PTR_ERR(child);
1825                         btrfs_handle_fs_error(fs_info, ret, NULL);
1826                         goto enospc;
1827                 }
1828
1829                 btrfs_tree_lock(child);
1830                 btrfs_set_lock_blocking(child);
1831                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1832                 if (ret) {
1833                         btrfs_tree_unlock(child);
1834                         free_extent_buffer(child);
1835                         goto enospc;
1836                 }
1837
1838                 ret = tree_mod_log_insert_root(root->node, child, 1);
1839                 BUG_ON(ret < 0);
1840                 rcu_assign_pointer(root->node, child);
1841
1842                 add_root_to_dirty_list(root);
1843                 btrfs_tree_unlock(child);
1844
1845                 path->locks[level] = 0;
1846                 path->nodes[level] = NULL;
1847                 clean_tree_block(fs_info, mid);
1848                 btrfs_tree_unlock(mid);
1849                 /* once for the path */
1850                 free_extent_buffer(mid);
1851
1852                 root_sub_used(root, mid->len);
1853                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1854                 /* once for the root ptr */
1855                 free_extent_buffer_stale(mid);
1856                 return 0;
1857         }
1858         if (btrfs_header_nritems(mid) >
1859             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1860                 return 0;
1861
1862         left = read_node_slot(fs_info, parent, pslot - 1);
1863         if (IS_ERR(left))
1864                 left = NULL;
1865
1866         if (left) {
1867                 btrfs_tree_lock(left);
1868                 btrfs_set_lock_blocking(left);
1869                 wret = btrfs_cow_block(trans, root, left,
1870                                        parent, pslot - 1, &left);
1871                 if (wret) {
1872                         ret = wret;
1873                         goto enospc;
1874                 }
1875         }
1876
1877         right = read_node_slot(fs_info, parent, pslot + 1);
1878         if (IS_ERR(right))
1879                 right = NULL;
1880
1881         if (right) {
1882                 btrfs_tree_lock(right);
1883                 btrfs_set_lock_blocking(right);
1884                 wret = btrfs_cow_block(trans, root, right,
1885                                        parent, pslot + 1, &right);
1886                 if (wret) {
1887                         ret = wret;
1888                         goto enospc;
1889                 }
1890         }
1891
1892         /* first, try to make some room in the middle buffer */
1893         if (left) {
1894                 orig_slot += btrfs_header_nritems(left);
1895                 wret = push_node_left(trans, fs_info, left, mid, 1);
1896                 if (wret < 0)
1897                         ret = wret;
1898         }
1899
1900         /*
1901          * then try to empty the rightmost buffer into the middle
1902          */
1903         if (right) {
1904                 wret = push_node_left(trans, fs_info, mid, right, 1);
1905                 if (wret < 0 && wret != -ENOSPC)
1906                         ret = wret;
1907                 if (btrfs_header_nritems(right) == 0) {
1908                         clean_tree_block(fs_info, right);
1909                         btrfs_tree_unlock(right);
1910                         del_ptr(root, path, level + 1, pslot + 1);
1911                         root_sub_used(root, right->len);
1912                         btrfs_free_tree_block(trans, root, right, 0, 1);
1913                         free_extent_buffer_stale(right);
1914                         right = NULL;
1915                 } else {
1916                         struct btrfs_disk_key right_key;
1917                         btrfs_node_key(right, &right_key, 0);
1918                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1919                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1920                         BUG_ON(ret < 0);
1921                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1922                         btrfs_mark_buffer_dirty(parent);
1923                 }
1924         }
1925         if (btrfs_header_nritems(mid) == 1) {
1926                 /*
1927                  * we're not allowed to leave a node with one item in the
1928                  * tree during a delete.  A deletion from lower in the tree
1929                  * could try to delete the only pointer in this node.
1930                  * So, pull some keys from the left.
1931                  * There has to be a left pointer at this point because
1932                  * otherwise we would have pulled some pointers from the
1933                  * right
1934                  */
1935                 if (!left) {
1936                         ret = -EROFS;
1937                         btrfs_handle_fs_error(fs_info, ret, NULL);
1938                         goto enospc;
1939                 }
1940                 wret = balance_node_right(trans, fs_info, mid, left);
1941                 if (wret < 0) {
1942                         ret = wret;
1943                         goto enospc;
1944                 }
1945                 if (wret == 1) {
1946                         wret = push_node_left(trans, fs_info, left, mid, 1);
1947                         if (wret < 0)
1948                                 ret = wret;
1949                 }
1950                 BUG_ON(wret == 1);
1951         }
1952         if (btrfs_header_nritems(mid) == 0) {
1953                 clean_tree_block(fs_info, mid);
1954                 btrfs_tree_unlock(mid);
1955                 del_ptr(root, path, level + 1, pslot);
1956                 root_sub_used(root, mid->len);
1957                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1958                 free_extent_buffer_stale(mid);
1959                 mid = NULL;
1960         } else {
1961                 /* update the parent key to reflect our changes */
1962                 struct btrfs_disk_key mid_key;
1963                 btrfs_node_key(mid, &mid_key, 0);
1964                 ret = tree_mod_log_insert_key(parent, pslot,
1965                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1966                 BUG_ON(ret < 0);
1967                 btrfs_set_node_key(parent, &mid_key, pslot);
1968                 btrfs_mark_buffer_dirty(parent);
1969         }
1970
1971         /* update the path */
1972         if (left) {
1973                 if (btrfs_header_nritems(left) > orig_slot) {
1974                         extent_buffer_get(left);
1975                         /* left was locked after cow */
1976                         path->nodes[level] = left;
1977                         path->slots[level + 1] -= 1;
1978                         path->slots[level] = orig_slot;
1979                         if (mid) {
1980                                 btrfs_tree_unlock(mid);
1981                                 free_extent_buffer(mid);
1982                         }
1983                 } else {
1984                         orig_slot -= btrfs_header_nritems(left);
1985                         path->slots[level] = orig_slot;
1986                 }
1987         }
1988         /* double check we haven't messed things up */
1989         if (orig_ptr !=
1990             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1991                 BUG();
1992 enospc:
1993         if (right) {
1994                 btrfs_tree_unlock(right);
1995                 free_extent_buffer(right);
1996         }
1997         if (left) {
1998                 if (path->nodes[level] != left)
1999                         btrfs_tree_unlock(left);
2000                 free_extent_buffer(left);
2001         }
2002         return ret;
2003 }
2004
2005 /* Node balancing for insertion.  Here we only split or push nodes around
2006  * when they are completely full.  This is also done top down, so we
2007  * have to be pessimistic.
2008  */
2009 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2010                                           struct btrfs_root *root,
2011                                           struct btrfs_path *path, int level)
2012 {
2013         struct btrfs_fs_info *fs_info = root->fs_info;
2014         struct extent_buffer *right = NULL;
2015         struct extent_buffer *mid;
2016         struct extent_buffer *left = NULL;
2017         struct extent_buffer *parent = NULL;
2018         int ret = 0;
2019         int wret;
2020         int pslot;
2021         int orig_slot = path->slots[level];
2022
2023         if (level == 0)
2024                 return 1;
2025
2026         mid = path->nodes[level];
2027         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2028
2029         if (level < BTRFS_MAX_LEVEL - 1) {
2030                 parent = path->nodes[level + 1];
2031                 pslot = path->slots[level + 1];
2032         }
2033
2034         if (!parent)
2035                 return 1;
2036
2037         left = read_node_slot(fs_info, parent, pslot - 1);
2038         if (IS_ERR(left))
2039                 left = NULL;
2040
2041         /* first, try to make some room in the middle buffer */
2042         if (left) {
2043                 u32 left_nr;
2044
2045                 btrfs_tree_lock(left);
2046                 btrfs_set_lock_blocking(left);
2047
2048                 left_nr = btrfs_header_nritems(left);
2049                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2050                         wret = 1;
2051                 } else {
2052                         ret = btrfs_cow_block(trans, root, left, parent,
2053                                               pslot - 1, &left);
2054                         if (ret)
2055                                 wret = 1;
2056                         else {
2057                                 wret = push_node_left(trans, fs_info,
2058                                                       left, mid, 0);
2059                         }
2060                 }
2061                 if (wret < 0)
2062                         ret = wret;
2063                 if (wret == 0) {
2064                         struct btrfs_disk_key disk_key;
2065                         orig_slot += left_nr;
2066                         btrfs_node_key(mid, &disk_key, 0);
2067                         ret = tree_mod_log_insert_key(parent, pslot,
2068                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2069                         BUG_ON(ret < 0);
2070                         btrfs_set_node_key(parent, &disk_key, pslot);
2071                         btrfs_mark_buffer_dirty(parent);
2072                         if (btrfs_header_nritems(left) > orig_slot) {
2073                                 path->nodes[level] = left;
2074                                 path->slots[level + 1] -= 1;
2075                                 path->slots[level] = orig_slot;
2076                                 btrfs_tree_unlock(mid);
2077                                 free_extent_buffer(mid);
2078                         } else {
2079                                 orig_slot -=
2080                                         btrfs_header_nritems(left);
2081                                 path->slots[level] = orig_slot;
2082                                 btrfs_tree_unlock(left);
2083                                 free_extent_buffer(left);
2084                         }
2085                         return 0;
2086                 }
2087                 btrfs_tree_unlock(left);
2088                 free_extent_buffer(left);
2089         }
2090         right = read_node_slot(fs_info, parent, pslot + 1);
2091         if (IS_ERR(right))
2092                 right = NULL;
2093
2094         /*
2095          * then try to empty the rightmost buffer into the middle
2096          */
2097         if (right) {
2098                 u32 right_nr;
2099
2100                 btrfs_tree_lock(right);
2101                 btrfs_set_lock_blocking(right);
2102
2103                 right_nr = btrfs_header_nritems(right);
2104                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2105                         wret = 1;
2106                 } else {
2107                         ret = btrfs_cow_block(trans, root, right,
2108                                               parent, pslot + 1,
2109                                               &right);
2110                         if (ret)
2111                                 wret = 1;
2112                         else {
2113                                 wret = balance_node_right(trans, fs_info,
2114                                                           right, mid);
2115                         }
2116                 }
2117                 if (wret < 0)
2118                         ret = wret;
2119                 if (wret == 0) {
2120                         struct btrfs_disk_key disk_key;
2121
2122                         btrfs_node_key(right, &disk_key, 0);
2123                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2124                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2125                         BUG_ON(ret < 0);
2126                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2127                         btrfs_mark_buffer_dirty(parent);
2128
2129                         if (btrfs_header_nritems(mid) <= orig_slot) {
2130                                 path->nodes[level] = right;
2131                                 path->slots[level + 1] += 1;
2132                                 path->slots[level] = orig_slot -
2133                                         btrfs_header_nritems(mid);
2134                                 btrfs_tree_unlock(mid);
2135                                 free_extent_buffer(mid);
2136                         } else {
2137                                 btrfs_tree_unlock(right);
2138                                 free_extent_buffer(right);
2139                         }
2140                         return 0;
2141                 }
2142                 btrfs_tree_unlock(right);
2143                 free_extent_buffer(right);
2144         }
2145         return 1;
2146 }
2147
2148 /*
2149  * readahead one full node of leaves, finding things that are close
2150  * to the block in 'slot', and triggering readahead on them.
2151  */
2152 static void reada_for_search(struct btrfs_fs_info *fs_info,
2153                              struct btrfs_path *path,
2154                              int level, int slot, u64 objectid)
2155 {
2156         struct extent_buffer *node;
2157         struct btrfs_disk_key disk_key;
2158         u32 nritems;
2159         u64 search;
2160         u64 target;
2161         u64 nread = 0;
2162         struct extent_buffer *eb;
2163         u32 nr;
2164         u32 blocksize;
2165         u32 nscan = 0;
2166
2167         if (level != 1)
2168                 return;
2169
2170         if (!path->nodes[level])
2171                 return;
2172
2173         node = path->nodes[level];
2174
2175         search = btrfs_node_blockptr(node, slot);
2176         blocksize = fs_info->nodesize;
2177         eb = find_extent_buffer(fs_info, search);
2178         if (eb) {
2179                 free_extent_buffer(eb);
2180                 return;
2181         }
2182
2183         target = search;
2184
2185         nritems = btrfs_header_nritems(node);
2186         nr = slot;
2187
2188         while (1) {
2189                 if (path->reada == READA_BACK) {
2190                         if (nr == 0)
2191                                 break;
2192                         nr--;
2193                 } else if (path->reada == READA_FORWARD) {
2194                         nr++;
2195                         if (nr >= nritems)
2196                                 break;
2197                 }
2198                 if (path->reada == READA_BACK && objectid) {
2199                         btrfs_node_key(node, &disk_key, nr);
2200                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2201                                 break;
2202                 }
2203                 search = btrfs_node_blockptr(node, nr);
2204                 if ((search <= target && target - search <= 65536) ||
2205                     (search > target && search - target <= 65536)) {
2206                         readahead_tree_block(fs_info, search);
2207                         nread += blocksize;
2208                 }
2209                 nscan++;
2210                 if (nread > 65536 || nscan > 32)
2211                         break;
2212         }
2213 }
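/*
 * Illustrative numbers (not part of the original file), assuming a 16K
 * nodesize: the loop above only readaheads sibling blocks whose start is
 * within 64K of the target, and it stops after roughly 64K of issued
 * readahead (four to five nodes) or after scanning 32 slots, whichever
 * comes first.
 */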
2214
2215 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2216                                        struct btrfs_path *path, int level)
2217 {
2218         int slot;
2219         int nritems;
2220         struct extent_buffer *parent;
2221         struct extent_buffer *eb;
2222         u64 gen;
2223         u64 block1 = 0;
2224         u64 block2 = 0;
2225
2226         parent = path->nodes[level + 1];
2227         if (!parent)
2228                 return;
2229
2230         nritems = btrfs_header_nritems(parent);
2231         slot = path->slots[level + 1];
2232
2233         if (slot > 0) {
2234                 block1 = btrfs_node_blockptr(parent, slot - 1);
2235                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2236                 eb = find_extent_buffer(fs_info, block1);
2237                 /*
2238                  * if we get -EAGAIN from btrfs_buffer_uptodate, we
2239                  * don't want to return -EAGAIN here.  That will loop
2240                  * forever
2241                  */
2242                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2243                         block1 = 0;
2244                 free_extent_buffer(eb);
2245         }
2246         if (slot + 1 < nritems) {
2247                 block2 = btrfs_node_blockptr(parent, slot + 1);
2248                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2249                 eb = find_extent_buffer(fs_info, block2);
2250                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2251                         block2 = 0;
2252                 free_extent_buffer(eb);
2253         }
2254
2255         if (block1)
2256                 readahead_tree_block(fs_info, block1);
2257         if (block2)
2258                 readahead_tree_block(fs_info, block2);
2259 }
2260
2261
2262 /*
2263  * when we walk down the tree, it is usually safe to unlock the higher layers
2264  * in the tree.  The exceptions are when our path goes through slot 0, because
2265  * operations on the tree might require changing key pointers higher up in the
2266  * tree.
2267  *
2268  * callers might also have set path->keep_locks, which tells this code to keep
2269  * the lock if the path points to the last slot in the block.  This is part of
2270  * walking through the tree, and selecting the next slot in the higher block.
2271  *
2272  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2273  * if lowest_unlock is 1, level 0 won't be unlocked
2274  */
2275 static noinline void unlock_up(struct btrfs_path *path, int level,
2276                                int lowest_unlock, int min_write_lock_level,
2277                                int *write_lock_level)
2278 {
2279         int i;
2280         int skip_level = level;
2281         int no_skips = 0;
2282         struct extent_buffer *t;
2283
2284         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2285                 if (!path->nodes[i])
2286                         break;
2287                 if (!path->locks[i])
2288                         break;
2289                 if (!no_skips && path->slots[i] == 0) {
2290                         skip_level = i + 1;
2291                         continue;
2292                 }
2293                 if (!no_skips && path->keep_locks) {
2294                         u32 nritems;
2295                         t = path->nodes[i];
2296                         nritems = btrfs_header_nritems(t);
2297                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2298                                 skip_level = i + 1;
2299                                 continue;
2300                         }
2301                 }
2302                 if (skip_level < i && i >= lowest_unlock)
2303                         no_skips = 1;
2304
2305                 t = path->nodes[i];
2306                 if (i >= lowest_unlock && i > skip_level) {
2307                         btrfs_tree_unlock_rw(t, path->locks[i]);
2308                         path->locks[i] = 0;
2309                         if (write_lock_level &&
2310                             i > min_write_lock_level &&
2311                             i <= *write_lock_level) {
2312                                 *write_lock_level = i - 1;
2313                         }
2314                 }
2315         }
2316 }
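/*
 * Illustrative trace (not part of the original file): suppose level == 1,
 * lowest_unlock == 1 and path->slots[1] == 0.  The slot-0 test above sets
 * skip_level to 2, so neither level 1 nor level 2 gets unlocked: an
 * insertion or deletion at slot 0 would change the node's first key,
 * which then has to be rewritten in the still-locked level-2 parent.
 */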
2317
2318 /*
2319  * This releases any locks held in the path starting at level and
2320  * going all the way up to the root.
2321  *
2322  * btrfs_search_slot will keep the lock held on higher nodes in a few
2323  * corner cases, such as COW of the block at slot zero in the node.  This
2324  * ignores those rules, and it should only be called when there are no
2325  * more updates to be done higher up in the tree.
2326  */
2327 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2328 {
2329         int i;
2330
2331         if (path->keep_locks)
2332                 return;
2333
2334         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2335                 if (!path->nodes[i])
2336                         continue;
2337                 if (!path->locks[i])
2338                         continue;
2339                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2340                 path->locks[i] = 0;
2341         }
2342 }
2343
2344 /*
2345  * helper function for btrfs_search_slot.  The goal is to find a block
2346  * in cache without setting the path to blocking.  If we find the block
2347  * we return zero and the path is unchanged.
2348  *
2349  * If we can't find the block, we set the path blocking and do some
2350  * reada.  -EAGAIN is returned and the search must be repeated.
2351  */
2352 static int
2353 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2354                       struct extent_buffer **eb_ret, int level, int slot,
2355                       const struct btrfs_key *key)
2356 {
2357         struct btrfs_fs_info *fs_info = root->fs_info;
2358         u64 blocknr;
2359         u64 gen;
2360         struct extent_buffer *b = *eb_ret;
2361         struct extent_buffer *tmp;
2362         struct btrfs_key first_key;
2363         int ret;
2364         int parent_level;
2365
2366         blocknr = btrfs_node_blockptr(b, slot);
2367         gen = btrfs_node_ptr_generation(b, slot);
2368         parent_level = btrfs_header_level(b);
2369         btrfs_node_key_to_cpu(b, &first_key, slot);
2370
2371         tmp = find_extent_buffer(fs_info, blocknr);
2372         if (tmp) {
2373                 /* first we do an atomic uptodate check */
2374                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2375                         *eb_ret = tmp;
2376                         return 0;
2377                 }
2378
2379                 /* the pages were up to date, but we failed
2380                  * the generation number check.  Do a full
2381                  * read for the generation number that is correct.
2382                  * We must do this without dropping locks so
2383                  * we can trust our generation number
2384                  */
2385                 btrfs_set_path_blocking(p);
2386
2387                 /* now we're allowed to do a blocking uptodate check */
2388                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2389                 if (!ret) {
2390                         *eb_ret = tmp;
2391                         return 0;
2392                 }
2393                 free_extent_buffer(tmp);
2394                 btrfs_release_path(p);
2395                 return -EIO;
2396         }
2397
2398         /*
2399          * reduce lock contention at high levels
2400          * of the btree by dropping locks before
2401          * we read.  Don't release the lock on the current
2402          * level because we need to walk this node to figure
2403          * out which blocks to read.
2404          */
2405         btrfs_unlock_up_safe(p, level + 1);
2406         btrfs_set_path_blocking(p);
2407
2408         if (p->reada != READA_NONE)
2409                 reada_for_search(fs_info, p, level, slot, key->objectid);
2410
2411         ret = -EAGAIN;
2412         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2413                               &first_key);
2414         if (!IS_ERR(tmp)) {
2415                 /*
2416                  * If the read above didn't mark this buffer up to date,
2417                  * it will never end up being up to date.  Set ret to -EIO now
2418                  * and give up so that our caller doesn't loop forever
2419                  * on our EAGAINs.
2420                  */
2421                 if (!extent_buffer_uptodate(tmp))
2422                         ret = -EIO;
2423                 free_extent_buffer(tmp);
2424         } else {
2425                 ret = PTR_ERR(tmp);
2426         }
2427
2428         btrfs_release_path(p);
2429         return ret;
2430 }
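/*
 * Illustrative sketch (not part of the original file): the retry contract
 * of read_block_for_search().  On -EAGAIN the path has already been
 * released, so callers restart the whole descent, e.g.:
 *
 *      again:
 *              b = btrfs_search_slot_get_root(root, p, write_lock_level);
 *              ...walk down...
 *              err = read_block_for_search(root, p, &b, level, slot, key);
 *              if (err == -EAGAIN)
 *                      goto again;
 */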
2431
2432 /*
2433  * helper function for btrfs_search_slot.  This does all of the checks
2434  * for node-level blocks and does any balancing required based on
2435  * the ins_len.
2436  *
2437  * If no extra work was required, zero is returned.  If we had to
2438  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2439  * start over
2440  */
2441 static int
2442 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2443                        struct btrfs_root *root, struct btrfs_path *p,
2444                        struct extent_buffer *b, int level, int ins_len,
2445                        int *write_lock_level)
2446 {
2447         struct btrfs_fs_info *fs_info = root->fs_info;
2448         int ret;
2449
2450         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2451             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2452                 int sret;
2453
2454                 if (*write_lock_level < level + 1) {
2455                         *write_lock_level = level + 1;
2456                         btrfs_release_path(p);
2457                         goto again;
2458                 }
2459
2460                 btrfs_set_path_blocking(p);
2461                 reada_for_balance(fs_info, p, level);
2462                 sret = split_node(trans, root, p, level);
2463
2464                 BUG_ON(sret > 0);
2465                 if (sret) {
2466                         ret = sret;
2467                         goto done;
2468                 }
2469                 b = p->nodes[level];
2470         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2471                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2472                 int sret;
2473
2474                 if (*write_lock_level < level + 1) {
2475                         *write_lock_level = level + 1;
2476                         btrfs_release_path(p);
2477                         goto again;
2478                 }
2479
2480                 btrfs_set_path_blocking(p);
2481                 reada_for_balance(fs_info, p, level);
2482                 sret = balance_level(trans, root, p, level);
2483
2484                 if (sret) {
2485                         ret = sret;
2486                         goto done;
2487                 }
2488                 b = p->nodes[level];
2489                 if (!b) {
2490                         btrfs_release_path(p);
2491                         goto again;
2492                 }
2493                 BUG_ON(btrfs_header_nritems(b) == 1);
2494         }
2495         return 0;
2496
2497 again:
2498         ret = -EAGAIN;
2499 done:
2500         return ret;
2501 }
2502
2503 static void key_search_validate(struct extent_buffer *b,
2504                                 const struct btrfs_key *key,
2505                                 int level)
2506 {
2507 #ifdef CONFIG_BTRFS_ASSERT
2508         struct btrfs_disk_key disk_key;
2509
2510         btrfs_cpu_key_to_disk(&disk_key, key);
2511
2512         if (level == 0)
2513                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2514                     offsetof(struct btrfs_leaf, items[0].key),
2515                     sizeof(disk_key)));
2516         else
2517                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2518                     offsetof(struct btrfs_node, ptrs[0].key),
2519                     sizeof(disk_key)));
2520 #endif
2521 }
2522
2523 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2524                       int level, int *prev_cmp, int *slot)
2525 {
2526         if (*prev_cmp != 0) {
2527                 *prev_cmp = btrfs_bin_search(b, key, level, slot);
2528                 return *prev_cmp;
2529         }
2530
2531         key_search_validate(b, key, level);
2532         *slot = 0;
2533
2534         return 0;
2535 }
2536
2537 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2538                 u64 iobjectid, u64 ioff, u8 key_type,
2539                 struct btrfs_key *found_key)
2540 {
2541         int ret;
2542         struct btrfs_key key;
2543         struct extent_buffer *eb;
2544
2545         ASSERT(path);
2546         ASSERT(found_key);
2547
2548         key.type = key_type;
2549         key.objectid = iobjectid;
2550         key.offset = ioff;
2551
2552         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2553         if (ret < 0)
2554                 return ret;
2555
2556         eb = path->nodes[0];
2557         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2558                 ret = btrfs_next_leaf(fs_root, path);
2559                 if (ret)
2560                         return ret;
2561                 eb = path->nodes[0];
2562         }
2563
2564         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2565         if (found_key->type != key.type ||
2566                         found_key->objectid != key.objectid)
2567                 return 1;
2568
2569         return 0;
2570 }
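/*
 * Illustrative sketch (not part of the original file): a typical
 * btrfs_find_item() call.  example_find_inode_ref() is a made-up helper
 * and the key type was chosen just for the example.
 */
static int example_find_inode_ref(struct btrfs_root *fs_root,
                                  struct btrfs_path *path, u64 ino,
                                  u64 *parent_ret)
{
        struct btrfs_key found_key;
        int ret;

        /* ioff == 0: only the objectid and type have to match */
        ret = btrfs_find_item(fs_root, path, ino, 0,
                              BTRFS_INODE_REF_KEY, &found_key);
        if (ret == 0)
                /* for an INODE_REF the key offset is the parent objectid */
                *parent_ret = found_key.offset;
        btrfs_release_path(path);
        return ret;
}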
2571
2572 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2573                                                         struct btrfs_path *p,
2574                                                         int write_lock_level)
2575 {
2576         struct btrfs_fs_info *fs_info = root->fs_info;
2577         struct extent_buffer *b;
2578         int root_lock;
2579         int level = 0;
2580
2581         /* We try very hard to do read locks on the root */
2582         root_lock = BTRFS_READ_LOCK;
2583
2584         if (p->search_commit_root) {
2585                 /* The commit roots are read only so we always do read locks */
2586                 if (p->need_commit_sem)
2587                         down_read(&fs_info->commit_root_sem);
2588                 b = root->commit_root;
2589                 extent_buffer_get(b);
2590                 level = btrfs_header_level(b);
2591                 if (p->need_commit_sem)
2592                         up_read(&fs_info->commit_root_sem);
2593                 /*
2594                  * Ensure that all callers have set skip_locking when
2595                  * p->search_commit_root = 1.
2596                  */
2597                 ASSERT(p->skip_locking == 1);
2598
2599                 goto out;
2600         }
2601
2602         if (p->skip_locking) {
2603                 b = btrfs_root_node(root);
2604                 level = btrfs_header_level(b);
2605                 goto out;
2606         }
2607
2608         /*
2609          * If write_lock_level is set to the maximum, we can skip trying to
2610          * get the read lock.
2611          */
2612         if (write_lock_level < BTRFS_MAX_LEVEL) {
2613                 /*
2614                  * We don't know the level of the root node until we actually
2615                  * have it read locked
2616                  */
2617                 b = btrfs_read_lock_root_node(root);
2618                 level = btrfs_header_level(b);
2619                 if (level > write_lock_level)
2620                         goto out;
2621
2622                 /* Whoops, must trade for write lock */
2623                 btrfs_tree_read_unlock(b);
2624                 free_extent_buffer(b);
2625         }
2626
2627         b = btrfs_lock_root_node(root);
2628         root_lock = BTRFS_WRITE_LOCK;
2629
2630         /* The level might have changed, check again */
2631         level = btrfs_header_level(b);
2632
2633 out:
2634         p->nodes[level] = b;
2635         if (!p->skip_locking)
2636                 p->locks[level] = root_lock;
2637         /*
2638          * Callers are responsible for dropping b's references.
2639          */
2640         return b;
2641 }
2642
2643
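/*
 * Illustrative sketch (not part of the original file): a plain read-only
 * lookup with btrfs_search_slot(), documented below.  example_lookup()
 * and its key type are assumptions made up for the example.
 */
static int example_lookup(struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* NULL trans and cow == 0: search only, no tree modification */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret > 0)
                ret = -ENOENT;  /* key absent; slot is the insert position */
        /* on 0, path->nodes[0]/path->slots[0] point at the item */
        btrfs_free_path(path);
        return ret;
}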
2644 /*
2645  * btrfs_search_slot - look for a key in a tree and perform necessary
2646  * modifications to preserve tree invariants.
2647  *
2648  * @trans:      Handle of transaction, used when modifying the tree
2649  * @p:          Holds all btree nodes along the search path
2650  * @root:       The root node of the tree
2651  * @key:        The key we are looking for
2652  * @ins_len:    Indicates purpose of search: > 0 for inserts (the byte size
2653  *              we need free), -1 for deletions, 0 for plain searches
2654  * @cow:        whether CoW operations should be performed. Must always be 1
2655  *              when modifying the tree.
2656  *
2657  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2658  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2659  *
2660  * If @key is found, 0 is returned and you can find the item in the leaf level
2661  * of the path (level 0)
2662  *
2663  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2664  * points to the slot where it should be inserted
2665  *
2666  * If an error is encountered while searching the tree a negative error number
2667  * is returned
2668  */
2669 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2670                       const struct btrfs_key *key, struct btrfs_path *p,
2671                       int ins_len, int cow)
2672 {
2673         struct btrfs_fs_info *fs_info = root->fs_info;
2674         struct extent_buffer *b;
2675         int slot;
2676         int ret;
2677         int err;
2678         int level;
2679         int lowest_unlock = 1;
2680         /* everything at write_lock_level or lower must be write locked */
2681         int write_lock_level = 0;
2682         u8 lowest_level = 0;
2683         int min_write_lock_level;
2684         int prev_cmp;
2685
2686         lowest_level = p->lowest_level;
2687         WARN_ON(lowest_level && ins_len > 0);
2688         WARN_ON(p->nodes[0] != NULL);
2689         BUG_ON(!cow && ins_len);
2690
2691         if (ins_len < 0) {
2692                 lowest_unlock = 2;
2693
2694                 /* when we are removing items, we might have to go up to level
2695                  * two as we update tree pointers.  Make sure we keep write
2696                  * locks on those levels as well
2697                  */
2698                 write_lock_level = 2;
2699         } else if (ins_len > 0) {
2700                 /*
2701                  * for inserting items, make sure we have a write lock on
2702                  * level 1 so we can update keys
2703                  */
2704                 write_lock_level = 1;
2705         }
2706
2707         if (!cow)
2708                 write_lock_level = -1;
2709
2710         if (cow && (p->keep_locks || p->lowest_level))
2711                 write_lock_level = BTRFS_MAX_LEVEL;
2712
2713         min_write_lock_level = write_lock_level;
2714
2715 again:
2716         prev_cmp = -1;
2717         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2718
2719         while (b) {
2720                 level = btrfs_header_level(b);
2721
2722                 /*
2723                  * setup the path here so we can release it under lock
2724                  * contention with the cow code
2725                  */
2726                 if (cow) {
2727                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2728
2729                         /*
2730                          * if we don't really need to cow this block
2731                          * then we don't want to set the path blocking,
2732                          * so we test it here
2733                          */
2734                         if (!should_cow_block(trans, root, b)) {
2735                                 trans->dirty = true;
2736                                 goto cow_done;
2737                         }
2738
2739                         /*
2740                          * must have write locks on this node and the
2741                          * parent
2742                          */
2743                         if (level > write_lock_level ||
2744                             (level + 1 > write_lock_level &&
2745                             level + 1 < BTRFS_MAX_LEVEL &&
2746                             p->nodes[level + 1])) {
2747                                 write_lock_level = level + 1;
2748                                 btrfs_release_path(p);
2749                                 goto again;
2750                         }
2751
2752                         btrfs_set_path_blocking(p);
2753                         if (last_level)
2754                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2755                                                       &b);
2756                         else
2757                                 err = btrfs_cow_block(trans, root, b,
2758                                                       p->nodes[level + 1],
2759                                                       p->slots[level + 1], &b);
2760                         if (err) {
2761                                 ret = err;
2762                                 goto done;
2763                         }
2764                 }
2765 cow_done:
2766                 p->nodes[level] = b;
2767                 /*
2768                  * Leave the path with blocking locks to avoid massive
2769                  * lock context switching; this is made on purpose.
2770                  */
2771
2772                 /*
2773                  * we have a lock on b and as long as we aren't changing
2774                  * the tree, there is no way for the items in b to change.
2775                  * It is safe to drop the lock on our parent before we
2776                  * go through the expensive btree search on b.
2777                  *
2778                  * If we're inserting or deleting (ins_len != 0), then we might
2779                  * be changing slot zero, which may require changing the parent.
2780                  * So, we can't drop the lock until after we know which slot
2781                  * we're operating on.
2782                  */
2783                 if (!ins_len && !p->keep_locks) {
2784                         int u = level + 1;
2785
2786                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2787                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2788                                 p->locks[u] = 0;
2789                         }
2790                 }
2791
2792                 ret = key_search(b, key, level, &prev_cmp, &slot);
2793                 if (ret < 0)
2794                         goto done;
2795
2796                 if (level != 0) {
2797                         int dec = 0;
2798                         if (ret && slot > 0) {
2799                                 dec = 1;
2800                                 slot -= 1;
2801                         }
2802                         p->slots[level] = slot;
2803                         err = setup_nodes_for_search(trans, root, p, b, level,
2804                                              ins_len, &write_lock_level);
2805                         if (err == -EAGAIN)
2806                                 goto again;
2807                         if (err) {
2808                                 ret = err;
2809                                 goto done;
2810                         }
2811                         b = p->nodes[level];
2812                         slot = p->slots[level];
2813
2814                         /*
2815                          * slot 0 is special: if we change the key
2816                          * we have to update the parent pointer,
2817                          * which means we must have a write lock
2818                          * on the parent.
2819                          */
2820                         if (slot == 0 && ins_len &&
2821                             write_lock_level < level + 1) {
2822                                 write_lock_level = level + 1;
2823                                 btrfs_release_path(p);
2824                                 goto again;
2825                         }
2826
2827                         unlock_up(p, level, lowest_unlock,
2828                                   min_write_lock_level, &write_lock_level);
2829
2830                         if (level == lowest_level) {
2831                                 if (dec)
2832                                         p->slots[level]++;
2833                                 goto done;
2834                         }
2835
2836                         err = read_block_for_search(root, p, &b, level,
2837                                                     slot, key);
2838                         if (err == -EAGAIN)
2839                                 goto again;
2840                         if (err) {
2841                                 ret = err;
2842                                 goto done;
2843                         }
2844
2845                         if (!p->skip_locking) {
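                                /*
                                 * Try the cheap spinning variant of the lock
                                 * first; only if that fails, set the path
                                 * blocking and take the blocking variant of
                                 * the lock.
                                 */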
2846                                 level = btrfs_header_level(b);
2847                                 if (level <= write_lock_level) {
2848                                         err = btrfs_try_tree_write_lock(b);
2849                                         if (!err) {
2850                                                 btrfs_set_path_blocking(p);
2851                                                 btrfs_tree_lock(b);
2852                                         }
2853                                         p->locks[level] = BTRFS_WRITE_LOCK;
2854                                 } else {
2855                                         err = btrfs_tree_read_lock_atomic(b);
2856                                         if (!err) {
2857                                                 btrfs_set_path_blocking(p);
2858                                                 btrfs_tree_read_lock(b);
2859                                         }
2860                                         p->locks[level] = BTRFS_READ_LOCK;
2861                                 }
2862                                 p->nodes[level] = b;
2863                         }
2864                 } else {
2865                         p->slots[level] = slot;
2866                         if (ins_len > 0 &&
2867                             btrfs_leaf_free_space(fs_info, b) < ins_len) {
2868                                 if (write_lock_level < 1) {
2869                                         write_lock_level = 1;
2870                                         btrfs_release_path(p);
2871                                         goto again;
2872                                 }
2873
2874                                 btrfs_set_path_blocking(p);
2875                                 err = split_leaf(trans, root, key,
2876                                                  p, ins_len, ret == 0);
2877
2878                                 BUG_ON(err > 0);
2879                                 if (err) {
2880                                         ret = err;
2881                                         goto done;
2882                                 }
2883                         }
2884                         if (!p->search_for_split)
2885                                 unlock_up(p, level, lowest_unlock,
2886                                           min_write_lock_level, NULL);
2887                         goto done;
2888                 }
2889         }
2890         ret = 1;
2891 done:
2892         /*
2893          * we don't really know what the caller plans on doing with the
2894          * path from here on, so for now just mark it as blocking.
2895          */
2896         if (!p->leave_spinning)
2897                 btrfs_set_path_blocking(p);
2898         if (ret < 0 && !p->skip_release_on_error)
2899                 btrfs_release_path(p);
2900         return ret;
2901 }
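
/*
 * Illustrative sketch (not part of the original file): the typical read-only
 * caller pattern for btrfs_search_slot().  The key values below are
 * hypothetical and error handling is trimmed to the essentials.
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);
 *
 * ret == 0 means the exact key was found at path->slots[0]; ret > 0 means it
 * was not found and the path points at the slot where it would be inserted;
 * ret < 0 is an error.
 */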
2902
2903 /*
2904  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2905  * current state of the tree together with the operations recorded in the tree
2906  * modification log to search for the key in a previous version of this tree, as
2907  * denoted by the time_seq parameter.
2908  *
2909  * Naturally, there is no support for insert, delete or cow operations.
2910  *
2911  * The resulting path and return value will be set up as if we called
2912  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2913  */
2914 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2915                           struct btrfs_path *p, u64 time_seq)
2916 {
2917         struct btrfs_fs_info *fs_info = root->fs_info;
2918         struct extent_buffer *b;
2919         int slot;
2920         int ret;
2921         int err;
2922         int level;
2923         int lowest_unlock = 1;
2924         u8 lowest_level = 0;
2925         int prev_cmp = -1;
2926
2927         lowest_level = p->lowest_level;
2928         WARN_ON(p->nodes[0] != NULL);
2929
2930         if (p->search_commit_root) {
2931                 BUG_ON(time_seq);
2932                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2933         }
2934
2935 again:
2936         b = get_old_root(root, time_seq);
2937         if (!b) {
2938                 ret = -EIO;
2939                 goto done;
2940         }
2941         level = btrfs_header_level(b);
2942         p->locks[level] = BTRFS_READ_LOCK;
2943
2944         while (b) {
2945                 level = btrfs_header_level(b);
2946                 p->nodes[level] = b;
2947
2948                 /*
2949                  * we have a lock on b and as long as we aren't changing
2950                  * the tree, there is no way for the items in b to change.
2951                  * It is safe to drop the lock on our parent before we
2952                  * go through the expensive btree search on b.
2953                  */
2954                 btrfs_unlock_up_safe(p, level + 1);
2955
2956                 /*
2957                  * Since we can unwind extent buffers (ebs) we want to do a
2958                  * real search every time.
2959                  */
2960                 prev_cmp = -1;
2961                 ret = key_search(b, key, level, &prev_cmp, &slot);
2962
2963                 if (level != 0) {
2964                         int dec = 0;
2965                         if (ret && slot > 0) {
2966                                 dec = 1;
2967                                 slot -= 1;
2968                         }
2969                         p->slots[level] = slot;
2970                         unlock_up(p, level, lowest_unlock, 0, NULL);
2971
2972                         if (level == lowest_level) {
2973                                 if (dec)
2974                                         p->slots[level]++;
2975                                 goto done;
2976                         }
2977
2978                         err = read_block_for_search(root, p, &b, level,
2979                                                     slot, key);
2980                         if (err == -EAGAIN)
2981                                 goto again;
2982                         if (err) {
2983                                 ret = err;
2984                                 goto done;
2985                         }
2986
2987                         level = btrfs_header_level(b);
2988                         err = btrfs_tree_read_lock_atomic(b);
2989                         if (!err) {
2990                                 btrfs_set_path_blocking(p);
2991                                 btrfs_tree_read_lock(b);
2992                         }
2993                         b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2994                         if (!b) {
2995                                 ret = -ENOMEM;
2996                                 goto done;
2997                         }
2998                         p->locks[level] = BTRFS_READ_LOCK;
2999                         p->nodes[level] = b;
3000                 } else {
3001                         p->slots[level] = slot;
3002                         unlock_up(p, level, lowest_unlock, 0, NULL);
3003                         goto done;
3004                 }
3005         }
3006         ret = 1;
3007 done:
3008         if (!p->leave_spinning)
3009                 btrfs_set_path_blocking(p);
3010         if (ret < 0)
3011                 btrfs_release_path(p);
3012
3013         return ret;
3014 }
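
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * looking up a key as it appeared at tree mod log sequence number time_seq,
 * which would come from the tree mod log bookkeeping (see
 * btrfs_get_tree_mod_seq()).  Insert, delete and cow are unsupported, so the
 * resulting path is only useful for reading.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	btrfs_free_path(path);
 *
 * The return convention matches btrfs_search_slot() with ins_len and cow
 * both 0.
 */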
3015
3016 /*
3017  * Helper to use instead of btrfs_search_slot() if no exact match is
3018  * needed; the next or previous item is returned instead.
3019  * When find_higher is true, the next higher item is returned, the next lower
3020  * otherwise.
3021  * When return_any and find_higher are both true, and no higher item is found,
3022  * return the next lower instead.
3023  * When return_any is true and find_higher is false, and no lower item is found,
3024  * return the next higher instead.
3025  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3026  * < 0 on error
3027  */
3028 int btrfs_search_slot_for_read(struct btrfs_root *root,
3029                                const struct btrfs_key *key,
3030                                struct btrfs_path *p, int find_higher,
3031                                int return_any)
3032 {
3033         int ret;
3034         struct extent_buffer *leaf;
3035
3036 again:
3037         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3038         if (ret <= 0)
3039                 return ret;
3040         /*
3041          * a return value of 1 means the path is at the position where the
3042          * item should be inserted. Normally this is the next bigger item,
3043          * but in case the previous item is the last in a leaf, path points
3044          * to the first free slot in the previous leaf, i.e. at an invalid
3045          * item.
3046          */
3047         leaf = p->nodes[0];
3048
3049         if (find_higher) {
3050                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3051                         ret = btrfs_next_leaf(root, p);
3052                         if (ret <= 0)
3053                                 return ret;
3054                         if (!return_any)
3055                                 return 1;
3056                         /*
3057                          * no higher item found, return the next
3058                          * lower instead
3059                          */
3060                         return_any = 0;
3061                         find_higher = 0;
3062                         btrfs_release_path(p);
3063                         goto again;
3064                 }
3065         } else {
3066                 if (p->slots[0] == 0) {
3067                         ret = btrfs_prev_leaf(root, p);
3068                         if (ret < 0)
3069                                 return ret;
3070                         if (!ret) {
3071                                 leaf = p->nodes[0];
3072                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3073                                         p->slots[0]--;
3074                                 return 0;
3075                         }
3076                         if (!return_any)
3077                                 return 1;
3078                         /*
3079                          * no lower item found, return the next
3080                          * higher instead
3081                          */
3082                         return_any = 0;
3083                         find_higher = 1;
3084                         btrfs_release_path(p);
3085                         goto again;
3086                 } else {
3087                         --p->slots[0];
3088                 }
3089         }
3090         return 0;
3091 }
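
/*
 * Illustrative sketch (not part of the original file): using
 * btrfs_search_slot_for_read() to find the first item at or after a
 * hypothetical key, falling back to the next lower item when nothing higher
 * exists (find_higher == 1, return_any == 1).
 *
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 *
 * ret == 1 means no item was found (the tree is empty); ret < 0 is an error.
 */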
3092
3093 /*
3094  * adjust the pointers going up the tree, starting at level
3095  * making sure the right key of each node points to 'key'.
3096  * This is used after shifting pointers to the left, so it stops
3097  * fixing up pointers when a given leaf/node is not in slot 0 of the
3098  * higher levels
3099  *
3100  */
3101 static void fixup_low_keys(struct btrfs_path *path,
3102                            struct btrfs_disk_key *key, int level)
3103 {
3104         int i;
3105         struct extent_buffer *t;
3106         int ret;
3107
3108         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3109                 int tslot = path->slots[i];
3110
3111                 if (!path->nodes[i])
3112                         break;
3113                 t = path->nodes[i];
3114                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3115                                 GFP_ATOMIC);
3116                 BUG_ON(ret < 0);
3117                 btrfs_set_node_key(t, key, tslot);
3118                 btrfs_mark_buffer_dirty(path->nodes[i]);
3119                 if (tslot != 0)
3120                         break;
3121         }
3122 }
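
/*
 * Worked example for fixup_low_keys() (hypothetical, for illustration): when
 * the lowest item of a node changes, e.g. after items are shifted to the left
 * neighbour, the parent's key pointing to that node must be updated to the
 * new lowest key.  If the node sits in slot 0 of its parent, the parent's own
 * lowest key changes as well, so the fixup repeats at the grandparent, and so
 * on up the tree; the walk stops at the first level where the slot is
 * non-zero, because there the slot-0 key of that node is unaffected.
 */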
3123
3124 /*
3125  * update item key.
3126  *
3127  * This function isn't completely safe. It is the caller's responsibility
3128  * to ensure that the new key won't break the ordering.
3129  */
3130 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3131                              struct btrfs_path *path,
3132                              const struct btrfs_key *new_key)
3133 {
3134         struct btrfs_disk_key disk_key;
3135         struct extent_buffer *eb;
3136         int slot;
3137
3138         eb = path->nodes[0];
3139