// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
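
/*
 * Typical caller pattern (an illustrative sketch, not lifted from any
 * specific call site): allocate a path, search with it, then free it.
 * btrfs_free_path() releases whatever locks and extent buffer references
 * the path still holds.
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	... read items via path->nodes[0] and path->slots[0] ...
 * out:
 *	btrfs_free_path(path);
 */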

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here; we could free up the root node
		 * because it was COWed but we may not get the new root
		 * node yet, so do the inc_not_zero dance.  If it doesn't
		 * work, synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A read
 * locked buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything not a reference counted cow subvolume) just
 * get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no
 * new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
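
/*
 * Illustrative usage of the get/put pair below (a sketch, not copied from
 * a real caller; it assumes the SEQ_LIST_INIT() helper from ctree.h and an
 * fs_info in scope):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... tree mod log entries with seq >= elem.seq are kept alive, so
 *	... the tree state at elem.seq can still be rewound ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * SEQ_LIST_INIT() leaves elem.seq at zero, which is what makes the get
 * actually register the blocker.
 */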

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
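
/*
 * A concrete case of the check above (illustrative, numbers invented): in a
 * subvolume tree with BTRFS_ROOT_REF_COWS set, a snapshot taken in
 * transaction 8 leaves last_snapshot at 8.  A non-root block written in
 * generation 5 satisfies generation (5) <= last_snapshot (8), so it may
 * still be referenced by the snapshot and must be treated as potentially
 * shared.  A block written in generation 9 cannot be seen by that snapshot
 * and is known not to be shared.
 */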

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop
	 * tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	/*
	 * If we are COWing a node/leaf from the extent, chunk or device trees,
	 * make sure that we do not finish block group creation of pending block
	 * groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk and device trees, therefore we could
	 * deadlock with ourselves since we are holding a lock on an extent
	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no longer a root. we simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
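
/*
 * Worked example of the rewind above (illustrative, numbers invented):
 * suppose eb currently has nritems n = 5 and the newest log entry for it is
 * a MOD_LOG_KEY_ADD with seq 42 for slot 4.  Rewinding to time_seq 40
 * applies the inverse of the add, i.e. n is decremented to 4, so the
 * rewound buffer no longer shows the item inserted at seq 42.  A
 * MOD_LOG_KEY_REMOVE entry is undone the opposite way: the saved key,
 * blockptr and generation are written back into the slot and n grows.
 */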

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
1429
1430 /*
1431  * cows a single block, see __btrfs_cow_block for the real work.
1432  * This version of it has extra checks so that a block isn't COWed more than
1433  * once per transaction, as long as it hasn't been written yet
1434  */
1435 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1436                     struct btrfs_root *root, struct extent_buffer *buf,
1437                     struct extent_buffer *parent, int parent_slot,
1438                     struct extent_buffer **cow_ret)
1439 {
1440         struct btrfs_fs_info *fs_info = root->fs_info;
1441         u64 search_start;
1442         int ret;
1443
1444         if (trans->transaction != fs_info->running_transaction)
1445                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1446                        trans->transid,
1447                        fs_info->running_transaction->transid);
1448
1449         if (trans->transid != fs_info->generation)
1450                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1451                        trans->transid, fs_info->generation);
1452
1453         if (!should_cow_block(trans, root, buf)) {
1454                 trans->dirty = true;
1455                 *cow_ret = buf;
1456                 return 0;
1457         }
1458
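        /*
         * allocation hint: try to place the copy in the same 1GiB
         * aligned region as the original block
         */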
1459         search_start = buf->start & ~((u64)SZ_1G - 1);
1460
1461         if (parent)
1462                 btrfs_set_lock_blocking(parent);
1463         btrfs_set_lock_blocking(buf);
1464
1465         ret = __btrfs_cow_block(trans, root, buf, parent,
1466                                  parent_slot, cow_ret, search_start, 0);
1467
1468         trace_btrfs_cow_block(root, buf, *cow_ret);
1469
1470         return ret;
1471 }
1472
1473 /*
1474  * helper function for defrag to decide if two blocks pointed to by a
1475  * node are actually close by, i.e. within 32KiB of each other
1476  */
1477 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1478 {
1479         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1480                 return 1;
1481         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1482                 return 1;
1483         return 0;
1484 }
1485
1486 /*
1487  * compare two keys in a memcmp fashion
1488  */
1489 static int comp_keys(const struct btrfs_disk_key *disk,
1490                      const struct btrfs_key *k2)
1491 {
1492         struct btrfs_key k1;
1493
1494         btrfs_disk_key_to_cpu(&k1, disk);
1495
1496         return btrfs_comp_cpu_keys(&k1, k2);
1497 }
1498
1499 /*
1500  * same as comp_keys only with two btrfs_key's
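 *
 * keys sort by objectid, then type, then offset, so e.g.
 * {objectid 5, type 1, offset 0} < {5, 2, 0} < {6, 0, 0}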
1501  */
1502 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1503 {
1504         if (k1->objectid > k2->objectid)
1505                 return 1;
1506         if (k1->objectid < k2->objectid)
1507                 return -1;
1508         if (k1->type > k2->type)
1509                 return 1;
1510         if (k1->type < k2->type)
1511                 return -1;
1512         if (k1->offset > k2->offset)
1513                 return 1;
1514         if (k1->offset < k2->offset)
1515                 return -1;
1516         return 0;
1517 }
1518
1519 /*
1520  * this is used by the defrag code to go through all the
1521  * leaves pointed to by a node and reallocate them so that
1522  * disk order is close to key order
1523  */
1524 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1525                        struct btrfs_root *root, struct extent_buffer *parent,
1526                        int start_slot, u64 *last_ret,
1527                        struct btrfs_key *progress)
1528 {
1529         struct btrfs_fs_info *fs_info = root->fs_info;
1530         struct extent_buffer *cur;
1531         u64 blocknr;
1532         u64 gen;
1533         u64 search_start = *last_ret;
1534         u64 last_block = 0;
1535         u64 other;
1536         u32 parent_nritems;
1537         int end_slot;
1538         int i;
1539         int err = 0;
1540         int parent_level;
1541         int uptodate;
1542         u32 blocksize;
1543         int progress_passed = 0;
1544         struct btrfs_disk_key disk_key;
1545
1546         parent_level = btrfs_header_level(parent);
1547
1548         WARN_ON(trans->transaction != fs_info->running_transaction);
1549         WARN_ON(trans->transid != fs_info->generation);
1550
1551         parent_nritems = btrfs_header_nritems(parent);
1552         blocksize = fs_info->nodesize;
1553         end_slot = parent_nritems - 1;
1554
1555         if (parent_nritems <= 1)
1556                 return 0;
1557
1558         btrfs_set_lock_blocking(parent);
1559
1560         for (i = start_slot; i <= end_slot; i++) {
1561                 struct btrfs_key first_key;
1562                 int close = 1;
1563
1564                 btrfs_node_key(parent, &disk_key, i);
1565                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1566                         continue;
1567
1568                 progress_passed = 1;
1569                 blocknr = btrfs_node_blockptr(parent, i);
1570                 gen = btrfs_node_ptr_generation(parent, i);
1571                 btrfs_node_key_to_cpu(parent, &first_key, i);
1572                 if (last_block == 0)
1573                         last_block = blocknr;
1574
1575                 if (i > 0) {
1576                         other = btrfs_node_blockptr(parent, i - 1);
1577                         close = close_blocks(blocknr, other, blocksize);
1578                 }
1579                 if (!close && i < end_slot) {
1580                         other = btrfs_node_blockptr(parent, i + 1);
1581                         close = close_blocks(blocknr, other, blocksize);
1582                 }
1583                 if (close) {
1584                         last_block = blocknr;
1585                         continue;
1586                 }
1587
1588                 cur = find_extent_buffer(fs_info, blocknr);
1589                 if (cur)
1590                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1591                 else
1592                         uptodate = 0;
1593                 if (!cur || !uptodate) {
1594                         if (!cur) {
1595                                 cur = read_tree_block(fs_info, blocknr, gen,
1596                                                       parent_level - 1,
1597                                                       &first_key);
1598                                 if (IS_ERR(cur)) {
1599                                         return PTR_ERR(cur);
1600                                 } else if (!extent_buffer_uptodate(cur)) {
1601                                         free_extent_buffer(cur);
1602                                         return -EIO;
1603                                 }
1604                         } else if (!uptodate) {
1605                                 err = btrfs_read_buffer(cur, gen,
1606                                                 parent_level - 1, &first_key);
1607                                 if (err) {
1608                                         free_extent_buffer(cur);
1609                                         return err;
1610                                 }
1611                         }
1612                 }
1613                 if (search_start == 0)
1614                         search_start = last_block;
1615
1616                 btrfs_tree_lock(cur);
1617                 btrfs_set_lock_blocking(cur);
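                /*
                 * COW with a size hint so the allocator leaves room for
                 * the remaining blocks we may still have to relocate
                 */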
1618                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1619                                         &cur, search_start,
1620                                         min(16 * blocksize,
1621                                             (end_slot - i) * blocksize));
1622                 if (err) {
1623                         btrfs_tree_unlock(cur);
1624                         free_extent_buffer(cur);
1625                         break;
1626                 }
1627                 search_start = cur->start;
1628                 last_block = cur->start;
1629                 *last_ret = search_start;
1630                 btrfs_tree_unlock(cur);
1631                 free_extent_buffer(cur);
1632         }
1633         return err;
1634 }
1635
1636 /*
1637  * search for key in the extent_buffer.  The items start at offset p,
1638  * and they are item_size apart.  There are 'max' items in p.
1639  *
1640  * the slot in the array is returned via slot, and it points to
1641  * the place where you would insert key if it is not found in
1642  * the array.
1643  *
1644  * slot may point to max if the key is bigger than all of the keys
1645  */
1646 static noinline int generic_bin_search(struct extent_buffer *eb,
1647                                        unsigned long p, int item_size,
1648                                        const struct btrfs_key *key,
1649                                        int max, int *slot)
1650 {
1651         int low = 0;
1652         int high = max;
1653         int mid;
1654         int ret;
1655         struct btrfs_disk_key *tmp = NULL;
1656         struct btrfs_disk_key unaligned;
1657         unsigned long offset;
1658         char *kaddr = NULL;
1659         unsigned long map_start = 0;
1660         unsigned long map_len = 0;
1661         int err;
1662
1663         if (low > high) {
1664                 btrfs_err(eb->fs_info,
1665                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1666                           __func__, low, high, eb->start,
1667                           btrfs_header_owner(eb), btrfs_header_level(eb));
1668                 return -EINVAL;
1669         }
1670
1671         while (low < high) {
1672                 mid = (low + high) / 2;
1673                 offset = p + mid * item_size;
1674
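                /*
                 * map the disk key in place if it fits in the mapped
                 * area; map_private_extent_buffer() returns 1 when the
                 * key straddles a page boundary and must be copied out
                 */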
1675                 if (!kaddr || offset < map_start ||
1676                     (offset + sizeof(struct btrfs_disk_key)) >
1677                     map_start + map_len) {
1678
1679                         err = map_private_extent_buffer(eb, offset,
1680                                                 sizeof(struct btrfs_disk_key),
1681                                                 &kaddr, &map_start, &map_len);
1682
1683                         if (!err) {
1684                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1685                                                         map_start);
1686                         } else if (err == 1) {
1687                                 read_extent_buffer(eb, &unaligned,
1688                                                    offset, sizeof(unaligned));
1689                                 tmp = &unaligned;
1690                         } else {
1691                                 return err;
1692                         }
1693
1694                 } else {
1695                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1696                                                         map_start);
1697                 }
1698                 ret = comp_keys(tmp, key);
1699
1700                 if (ret < 0)
1701                         low = mid + 1;
1702                 else if (ret > 0)
1703                         high = mid;
1704                 else {
1705                         *slot = mid;
1706                         return 0;
1707                 }
1708         }
1709         *slot = low;
1710         return 1;
1711 }
1712
1713 /*
1714  * simple bin_search frontend that does the right thing for
1715  * leaves vs nodes
1716  */
1717 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1718                      int level, int *slot)
1719 {
1720         if (level == 0)
1721                 return generic_bin_search(eb,
1722                                           offsetof(struct btrfs_leaf, items),
1723                                           sizeof(struct btrfs_item),
1724                                           key, btrfs_header_nritems(eb),
1725                                           slot);
1726         else
1727                 return generic_bin_search(eb,
1728                                           offsetof(struct btrfs_node, ptrs),
1729                                           sizeof(struct btrfs_key_ptr),
1730                                           key, btrfs_header_nritems(eb),
1731                                           slot);
1732 }
1733
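/*
 * adjust the bytes-used accounting in the root item; accounting_lock
 * keeps the read-modify-write of the counter atomic
 */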
1734 static void root_add_used(struct btrfs_root *root, u32 size)
1735 {
1736         spin_lock(&root->accounting_lock);
1737         btrfs_set_root_used(&root->root_item,
1738                             btrfs_root_used(&root->root_item) + size);
1739         spin_unlock(&root->accounting_lock);
1740 }
1741
1742 static void root_sub_used(struct btrfs_root *root, u32 size)
1743 {
1744         spin_lock(&root->accounting_lock);
1745         btrfs_set_root_used(&root->root_item,
1746                             btrfs_root_used(&root->root_item) - size);
1747         spin_unlock(&root->accounting_lock);
1748 }
1749
1750 /* given a node and slot number, this reads the block the slot points to.  The
1751  * extent buffer is returned with a reference taken (but unlocked).
1752  */
1753 static noinline struct extent_buffer *
1754 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1755                int slot)
1756 {
1757         int level = btrfs_header_level(parent);
1758         struct extent_buffer *eb;
1759         struct btrfs_key first_key;
1760
1761         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1762                 return ERR_PTR(-ENOENT);
1763
1764         BUG_ON(level == 0);
1765
1766         btrfs_node_key_to_cpu(parent, &first_key, slot);
1767         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1768                              btrfs_node_ptr_generation(parent, slot),
1769                              level - 1, &first_key);
1770         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1771                 free_extent_buffer(eb);
1772                 eb = ERR_PTR(-EIO);
1773         }
1774
1775         return eb;
1776 }
1777
1778 /*
1779  * node level balancing, used to make sure nodes are in proper order for
1780  * item deletion.  We balance from the top down, so we have to make sure
1781  * that a deletion won't leave a node completely empty later on.
1782  */
1783 static noinline int balance_level(struct btrfs_trans_handle *trans,
1784                          struct btrfs_root *root,
1785                          struct btrfs_path *path, int level)
1786 {
1787         struct btrfs_fs_info *fs_info = root->fs_info;
1788         struct extent_buffer *right = NULL;
1789         struct extent_buffer *mid;
1790         struct extent_buffer *left = NULL;
1791         struct extent_buffer *parent = NULL;
1792         int ret = 0;
1793         int wret;
1794         int pslot;
1795         int orig_slot = path->slots[level];
1796         u64 orig_ptr;
1797
1798         ASSERT(level > 0);
1799
1800         mid = path->nodes[level];
1801
1802         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1803                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1804         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1805
1806         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1807
1808         if (level < BTRFS_MAX_LEVEL - 1) {
1809                 parent = path->nodes[level + 1];
1810                 pslot = path->slots[level + 1];
1811         }
1812
1813         /*
1814          * deal with the case where there is only one pointer in the root
1815          * by promoting the node below to a root
1816          */
1817         if (!parent) {
1818                 struct extent_buffer *child;
1819
1820                 if (btrfs_header_nritems(mid) != 1)
1821                         return 0;
1822
1823                 /* promote the child to a root */
1824                 child = read_node_slot(fs_info, mid, 0);
1825                 if (IS_ERR(child)) {
1826                         ret = PTR_ERR(child);
1827                         btrfs_handle_fs_error(fs_info, ret, NULL);
1828                         goto enospc;
1829                 }
1830
1831                 btrfs_tree_lock(child);
1832                 btrfs_set_lock_blocking(child);
1833                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1834                 if (ret) {
1835                         btrfs_tree_unlock(child);
1836                         free_extent_buffer(child);
1837                         goto enospc;
1838                 }
1839
1840                 ret = tree_mod_log_insert_root(root->node, child, 1);
1841                 BUG_ON(ret < 0);
1842                 rcu_assign_pointer(root->node, child);
1843
1844                 add_root_to_dirty_list(root);
1845                 btrfs_tree_unlock(child);
1846
1847                 path->locks[level] = 0;
1848                 path->nodes[level] = NULL;
1849                 clean_tree_block(fs_info, mid);
1850                 btrfs_tree_unlock(mid);
1851                 /* once for the path */
1852                 free_extent_buffer(mid);
1853
1854                 root_sub_used(root, mid->len);
1855                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1856                 /* once for the root ptr */
1857                 free_extent_buffer_stale(mid);
1858                 return 0;
1859         }
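        /* more than a quarter full, no balancing needed */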
1860         if (btrfs_header_nritems(mid) >
1861             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1862                 return 0;
1863
1864         left = read_node_slot(fs_info, parent, pslot - 1);
1865         if (IS_ERR(left))
1866                 left = NULL;
1867
1868         if (left) {
1869                 btrfs_tree_lock(left);
1870                 btrfs_set_lock_blocking(left);
1871                 wret = btrfs_cow_block(trans, root, left,
1872                                        parent, pslot - 1, &left);
1873                 if (wret) {
1874                         ret = wret;
1875                         goto enospc;
1876                 }
1877         }
1878
1879         right = read_node_slot(fs_info, parent, pslot + 1);
1880         if (IS_ERR(right))
1881                 right = NULL;
1882
1883         if (right) {
1884                 btrfs_tree_lock(right);
1885                 btrfs_set_lock_blocking(right);
1886                 wret = btrfs_cow_block(trans, root, right,
1887                                        parent, pslot + 1, &right);
1888                 if (wret) {
1889                         ret = wret;
1890                         goto enospc;
1891                 }
1892         }
1893
1894         /* first, try to make some room in the middle buffer */
1895         if (left) {
1896                 orig_slot += btrfs_header_nritems(left);
1897                 wret = push_node_left(trans, fs_info, left, mid, 1);
1898                 if (wret < 0)
1899                         ret = wret;
1900         }
1901
1902         /*
1903          * then try to empty the rightmost buffer into the middle
1904          */
1905         if (right) {
1906                 wret = push_node_left(trans, fs_info, mid, right, 1);
1907                 if (wret < 0 && wret != -ENOSPC)
1908                         ret = wret;
1909                 if (btrfs_header_nritems(right) == 0) {
1910                         clean_tree_block(fs_info, right);
1911                         btrfs_tree_unlock(right);
1912                         del_ptr(root, path, level + 1, pslot + 1);
1913                         root_sub_used(root, right->len);
1914                         btrfs_free_tree_block(trans, root, right, 0, 1);
1915                         free_extent_buffer_stale(right);
1916                         right = NULL;
1917                 } else {
1918                         struct btrfs_disk_key right_key;
1919                         btrfs_node_key(right, &right_key, 0);
1920                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1921                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1922                         BUG_ON(ret < 0);
1923                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1924                         btrfs_mark_buffer_dirty(parent);
1925                 }
1926         }
1927         if (btrfs_header_nritems(mid) == 1) {
1928                 /*
1929                  * we're not allowed to leave a node with one item in the
1930                  * tree during a delete.  A deletion from lower in the tree
1931                  * could try to delete the only pointer in this node.
1932                  * So, pull some keys from the left.
1933                  * There has to be a left pointer at this point because
1934                  * otherwise we would have pulled some pointers from the
1935                  * right
1936                  */
1937                 if (!left) {
1938                         ret = -EROFS;
1939                         btrfs_handle_fs_error(fs_info, ret, NULL);
1940                         goto enospc;
1941                 }
1942                 wret = balance_node_right(trans, fs_info, mid, left);
1943                 if (wret < 0) {
1944                         ret = wret;
1945                         goto enospc;
1946                 }
1947                 if (wret == 1) {
1948                         wret = push_node_left(trans, fs_info, left, mid, 1);
1949                         if (wret < 0)
1950                                 ret = wret;
1951                 }
1952                 BUG_ON(wret == 1);
1953         }
1954         if (btrfs_header_nritems(mid) == 0) {
1955                 clean_tree_block(fs_info, mid);
1956                 btrfs_tree_unlock(mid);
1957                 del_ptr(root, path, level + 1, pslot);
1958                 root_sub_used(root, mid->len);
1959                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1960                 free_extent_buffer_stale(mid);
1961                 mid = NULL;
1962         } else {
1963                 /* update the parent key to reflect our changes */
1964                 struct btrfs_disk_key mid_key;
1965                 btrfs_node_key(mid, &mid_key, 0);
1966                 ret = tree_mod_log_insert_key(parent, pslot,
1967                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1968                 BUG_ON(ret < 0);
1969                 btrfs_set_node_key(parent, &mid_key, pslot);
1970                 btrfs_mark_buffer_dirty(parent);
1971         }
1972
1973         /* update the path */
1974         if (left) {
1975                 if (btrfs_header_nritems(left) > orig_slot) {
1976                         extent_buffer_get(left);
1977                         /* left was locked after cow */
1978                         path->nodes[level] = left;
1979                         path->slots[level + 1] -= 1;
1980                         path->slots[level] = orig_slot;
1981                         if (mid) {
1982                                 btrfs_tree_unlock(mid);
1983                                 free_extent_buffer(mid);
1984                         }
1985                 } else {
1986                         orig_slot -= btrfs_header_nritems(left);
1987                         path->slots[level] = orig_slot;
1988                 }
1989         }
1990         /* double check we haven't messed things up */
1991         if (orig_ptr !=
1992             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1993                 BUG();
1994 enospc:
1995         if (right) {
1996                 btrfs_tree_unlock(right);
1997                 free_extent_buffer(right);
1998         }
1999         if (left) {
2000                 if (path->nodes[level] != left)
2001                         btrfs_tree_unlock(left);
2002                 free_extent_buffer(left);
2003         }
2004         return ret;
2005 }
2006
2007 /* Node balancing for insertion.  Here we only split or push nodes around
2008  * when they are completely full.  This is also done top down, so we
2009  * have to be pessimistic.
2010  */
2011 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2012                                           struct btrfs_root *root,
2013                                           struct btrfs_path *path, int level)
2014 {
2015         struct btrfs_fs_info *fs_info = root->fs_info;
2016         struct extent_buffer *right = NULL;
2017         struct extent_buffer *mid;
2018         struct extent_buffer *left = NULL;
2019         struct extent_buffer *parent = NULL;
2020         int ret = 0;
2021         int wret;
2022         int pslot;
2023         int orig_slot = path->slots[level];
2024
2025         if (level == 0)
2026                 return 1;
2027
2028         mid = path->nodes[level];
2029         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2030
2031         if (level < BTRFS_MAX_LEVEL - 1) {
2032                 parent = path->nodes[level + 1];
2033                 pslot = path->slots[level + 1];
2034         }
2035
2036         if (!parent)
2037                 return 1;
2038
2039         left = read_node_slot(fs_info, parent, pslot - 1);
2040         if (IS_ERR(left))
2041                 left = NULL;
2042
2043         /* first, try to make some room in the middle buffer */
2044         if (left) {
2045                 u32 left_nr;
2046
2047                 btrfs_tree_lock(left);
2048                 btrfs_set_lock_blocking(left);
2049
2050                 left_nr = btrfs_header_nritems(left);
2051                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2052                         wret = 1;
2053                 } else {
2054                         ret = btrfs_cow_block(trans, root, left, parent,
2055                                               pslot - 1, &left);
2056                         if (ret)
2057                                 wret = 1;
2058                         else {
2059                                 wret = push_node_left(trans, fs_info,
2060                                                       left, mid, 0);
2061                         }
2062                 }
2063                 if (wret < 0)
2064                         ret = wret;
2065                 if (wret == 0) {
2066                         struct btrfs_disk_key disk_key;
2067                         orig_slot += left_nr;
2068                         btrfs_node_key(mid, &disk_key, 0);
2069                         ret = tree_mod_log_insert_key(parent, pslot,
2070                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2071                         BUG_ON(ret < 0);
2072                         btrfs_set_node_key(parent, &disk_key, pslot);
2073                         btrfs_mark_buffer_dirty(parent);
2074                         if (btrfs_header_nritems(left) > orig_slot) {
2075                                 path->nodes[level] = left;
2076                                 path->slots[level + 1] -= 1;
2077                                 path->slots[level] = orig_slot;
2078                                 btrfs_tree_unlock(mid);
2079                                 free_extent_buffer(mid);
2080                         } else {
2081                                 orig_slot -=
2082                                         btrfs_header_nritems(left);
2083                                 path->slots[level] = orig_slot;
2084                                 btrfs_tree_unlock(left);
2085                                 free_extent_buffer(left);
2086                         }
2087                         return 0;
2088                 }
2089                 btrfs_tree_unlock(left);
2090                 free_extent_buffer(left);
2091         }
2092         right = read_node_slot(fs_info, parent, pslot + 1);
2093         if (IS_ERR(right))
2094                 right = NULL;
2095
2096         /*
2097          * then try to empty the rightmost buffer into the middle
2098          */
2099         if (right) {
2100                 u32 right_nr;
2101
2102                 btrfs_tree_lock(right);
2103                 btrfs_set_lock_blocking(right);
2104
2105                 right_nr = btrfs_header_nritems(right);
2106                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2107                         wret = 1;
2108                 } else {
2109                         ret = btrfs_cow_block(trans, root, right,
2110                                               parent, pslot + 1,
2111                                               &right);
2112                         if (ret)
2113                                 wret = 1;
2114                         else {
2115                                 wret = balance_node_right(trans, fs_info,
2116                                                           right, mid);
2117                         }
2118                 }
2119                 if (wret < 0)
2120                         ret = wret;
2121                 if (wret == 0) {
2122                         struct btrfs_disk_key disk_key;
2123
2124                         btrfs_node_key(right, &disk_key, 0);
2125                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2126                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2127                         BUG_ON(ret < 0);
2128                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2129                         btrfs_mark_buffer_dirty(parent);
2130
2131                         if (btrfs_header_nritems(mid) <= orig_slot) {
2132                                 path->nodes[level] = right;
2133                                 path->slots[level + 1] += 1;
2134                                 path->slots[level] = orig_slot -
2135                                         btrfs_header_nritems(mid);
2136                                 btrfs_tree_unlock(mid);
2137                                 free_extent_buffer(mid);
2138                         } else {
2139                                 btrfs_tree_unlock(right);
2140                                 free_extent_buffer(right);
2141                         }
2142                         return 0;
2143                 }
2144                 btrfs_tree_unlock(right);
2145                 free_extent_buffer(right);
2146         }
2147         return 1;
2148 }
2149
2150 /*
2151  * readahead one full node of leaves, finding things that are close
2152  * to the block in 'slot', and triggering readahead on them.
2153  */
2154 static void reada_for_search(struct btrfs_fs_info *fs_info,
2155                              struct btrfs_path *path,
2156                              int level, int slot, u64 objectid)
2157 {
2158         struct extent_buffer *node;
2159         struct btrfs_disk_key disk_key;
2160         u32 nritems;
2161         u64 search;
2162         u64 target;
2163         u64 nread = 0;
2164         struct extent_buffer *eb;
2165         u32 nr;
2166         u32 blocksize;
2167         u32 nscan = 0;
2168
2169         if (level != 1)
2170                 return;
2171
2172         if (!path->nodes[level])
2173                 return;
2174
2175         node = path->nodes[level];
2176
2177         search = btrfs_node_blockptr(node, slot);
2178         blocksize = fs_info->nodesize;
2179         eb = find_extent_buffer(fs_info, search);
2180         if (eb) {
2181                 free_extent_buffer(eb);
2182                 return;
2183         }
2184
2185         target = search;
2186
2187         nritems = btrfs_header_nritems(node);
2188         nr = slot;
2189
2190         while (1) {
2191                 if (path->reada == READA_BACK) {
2192                         if (nr == 0)
2193                                 break;
2194                         nr--;
2195                 } else if (path->reada == READA_FORWARD) {
2196                         nr++;
2197                         if (nr >= nritems)
2198                                 break;
2199                 }
2200                 if (path->reada == READA_BACK && objectid) {
2201                         btrfs_node_key(node, &disk_key, nr);
2202                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2203                                 break;
2204                 }
2205                 search = btrfs_node_blockptr(node, nr);
2206                 if ((search <= target && target - search <= 65536) ||
2207                     (search > target && search - target <= 65536)) {
2208                         readahead_tree_block(fs_info, search);
2209                         nread += blocksize;
2210                 }
2211                 nscan++;
2212                 if (nread > 65536 || nscan > 32)
2213                         break;
2214         }
2215 }
2216
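/*
 * readahead the two siblings (slot - 1 and slot + 1) of the block the
 * path points to at 'level', since balancing is likely to touch them
 */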
2217 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2218                                        struct btrfs_path *path, int level)
2219 {
2220         int slot;
2221         int nritems;
2222         struct extent_buffer *parent;
2223         struct extent_buffer *eb;
2224         u64 gen;
2225         u64 block1 = 0;
2226         u64 block2 = 0;
2227
2228         parent = path->nodes[level + 1];
2229         if (!parent)
2230                 return;
2231
2232         nritems = btrfs_header_nritems(parent);
2233         slot = path->slots[level + 1];
2234
2235         if (slot > 0) {
2236                 block1 = btrfs_node_blockptr(parent, slot - 1);
2237                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2238                 eb = find_extent_buffer(fs_info, block1);
2239                 /*
2240                  * if we get -EAGAIN from btrfs_buffer_uptodate, we
2241                  * don't want to return -EAGAIN here.  That will loop
2242                  * forever
2243                  */
2244                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2245                         block1 = 0;
2246                 free_extent_buffer(eb);
2247         }
2248         if (slot + 1 < nritems) {
2249                 block2 = btrfs_node_blockptr(parent, slot + 1);
2250                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2251                 eb = find_extent_buffer(fs_info, block2);
2252                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2253                         block2 = 0;
2254                 free_extent_buffer(eb);
2255         }
2256
2257         if (block1)
2258                 readahead_tree_block(fs_info, block1);
2259         if (block2)
2260                 readahead_tree_block(fs_info, block2);
2261 }
2262
2263
2264 /*
2265  * when we walk down the tree, it is usually safe to unlock the higher layers
2266  * in the tree.  The exceptions are when our path goes through slot 0, because
2267  * operations on the tree might require changing key pointers higher up in the
2268  * tree.
2269  *
2270  * callers might also have set path->keep_locks, which tells this code to keep
2271  * the lock if the path points to the last slot in the block.  This is part of
2272  * walking through the tree, and selecting the next slot in the higher block.
2273  *
2274  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2275  * if lowest_unlock is 1, level 0 won't be unlocked
2276  */
2277 static noinline void unlock_up(struct btrfs_path *path, int level,
2278                                int lowest_unlock, int min_write_lock_level,
2279                                int *write_lock_level)
2280 {
2281         int i;
2282         int skip_level = level;
2283         int no_skips = 0;
2284         struct extent_buffer *t;
2285
2286         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2287                 if (!path->nodes[i])
2288                         break;
2289                 if (!path->locks[i])
2290                         break;
2291                 if (!no_skips && path->slots[i] == 0) {
2292                         skip_level = i + 1;
2293                         continue;
2294                 }
2295                 if (!no_skips && path->keep_locks) {
2296                         u32 nritems;
2297                         t = path->nodes[i];
2298                         nritems = btrfs_header_nritems(t);
2299                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2300                                 skip_level = i + 1;
2301                                 continue;
2302                         }
2303                 }
2304                 if (skip_level < i && i >= lowest_unlock)
2305                         no_skips = 1;
2306
2307                 t = path->nodes[i];
2308                 if (i >= lowest_unlock && i > skip_level) {
2309                         btrfs_tree_unlock_rw(t, path->locks[i]);
2310                         path->locks[i] = 0;
2311                         if (write_lock_level &&
2312                             i > min_write_lock_level &&
2313                             i <= *write_lock_level) {
2314                                 *write_lock_level = i - 1;
2315                         }
2316                 }
2317         }
2318 }
2319
2320 /*
2321  * This releases any locks held in the path starting at level and
2322  * going all the way up to the root.
2323  *
2324  * btrfs_search_slot will keep the lock held on higher nodes in a few
2325  * corner cases, such as COW of the block at slot zero in the node.  This
2326  * ignores those rules, and it should only be called when there are no
2327  * more updates to be done higher up in the tree.
2328  */
2329 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2330 {
2331         int i;
2332
2333         if (path->keep_locks)
2334                 return;
2335
2336         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2337                 if (!path->nodes[i])
2338                         continue;
2339                 if (!path->locks[i])
2340                         continue;
2341                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2342                 path->locks[i] = 0;
2343         }
2344 }
2345
2346 /*
2347  * helper function for btrfs_search_slot.  The goal is to find a block
2348  * in cache without setting the path to blocking.  If we find the block
2349  * we return zero and the path is unchanged.
2350  *
2351  * If we can't find the block, we set the path blocking and do some
2352  * reada.  -EAGAIN is returned and the search must be repeated.
2353  */
2354 static int
2355 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2356                       struct extent_buffer **eb_ret, int level, int slot,
2357                       const struct btrfs_key *key)
2358 {
2359         struct btrfs_fs_info *fs_info = root->fs_info;
2360         u64 blocknr;
2361         u64 gen;
2362         struct extent_buffer *b = *eb_ret;
2363         struct extent_buffer *tmp;
2364         struct btrfs_key first_key;
2365         int ret;
2366         int parent_level;
2367
2368         blocknr = btrfs_node_blockptr(b, slot);
2369         gen = btrfs_node_ptr_generation(b, slot);
2370         parent_level = btrfs_header_level(b);
2371         btrfs_node_key_to_cpu(b, &first_key, slot);
2372
2373         tmp = find_extent_buffer(fs_info, blocknr);
2374         if (tmp) {
2375                 /* first we do an atomic uptodate check */
2376                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2377                         *eb_ret = tmp;
2378                         return 0;
2379                 }
2380
2381                 /* the pages were up to date, but we failed
2382                  * the generation number check.  Do a full
2383                  * read for the generation number that is correct.
2384                  * We must do this without dropping locks so
2385                  * we can trust our generation number
2386                  */
2387                 btrfs_set_path_blocking(p);
2388
2389                 /* now we're allowed to do a blocking uptodate check */
2390                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2391                 if (!ret) {
2392                         *eb_ret = tmp;
2393                         return 0;
2394                 }
2395                 free_extent_buffer(tmp);
2396                 btrfs_release_path(p);
2397                 return -EIO;
2398         }
2399
2400         /*
2401          * reduce lock contention at high levels
2402          * of the btree by dropping locks before
2403          * we read.  Don't release the lock on the current
2404          * level because we need to walk this node to figure
2405          * out which blocks to read.
2406          */
2407         btrfs_unlock_up_safe(p, level + 1);
2408         btrfs_set_path_blocking(p);
2409
2410         if (p->reada != READA_NONE)
2411                 reada_for_search(fs_info, p, level, slot, key->objectid);
2412
2413         ret = -EAGAIN;
2414         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2415                               &first_key);
2416         if (!IS_ERR(tmp)) {
2417                 /*
2418                  * If the read above didn't mark this buffer up to date,
2419                  * it will never end up being up to date.  Set ret to EIO now
2420                  * and give up so that our caller doesn't loop forever
2421                  * on our EAGAINs.
2422                  */
2423                 if (!extent_buffer_uptodate(tmp))
2424                         ret = -EIO;
2425                 free_extent_buffer(tmp);
2426         } else {
2427                 ret = PTR_ERR(tmp);
2428         }
2429
2430         btrfs_release_path(p);
2431         return ret;
2432 }
2433
2434 /*
2435  * helper function for btrfs_search_slot.  This does all of the checks
2436  * for node-level blocks and does any balancing required based on
2437  * the ins_len.
2438  *
2439  * If no extra work was required, zero is returned.  If we had to
2440  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2441  * start over
2442  */
2443 static int
2444 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2445                        struct btrfs_root *root, struct btrfs_path *p,
2446                        struct extent_buffer *b, int level, int ins_len,
2447                        int *write_lock_level)
2448 {
2449         struct btrfs_fs_info *fs_info = root->fs_info;
2450         int ret;
2451
2452         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2453             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2454                 int sret;
2455
2456                 if (*write_lock_level < level + 1) {
2457                         *write_lock_level = level + 1;
2458                         btrfs_release_path(p);
2459                         goto again;
2460                 }
2461
2462                 btrfs_set_path_blocking(p);
2463                 reada_for_balance(fs_info, p, level);
2464                 sret = split_node(trans, root, p, level);
2465
2466                 BUG_ON(sret > 0);
2467                 if (sret) {
2468                         ret = sret;
2469                         goto done;
2470                 }
2471                 b = p->nodes[level];
2472         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2473                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2474                 int sret;
2475
2476                 if (*write_lock_level < level + 1) {
2477                         *write_lock_level = level + 1;
2478                         btrfs_release_path(p);
2479                         goto again;
2480                 }
2481
2482                 btrfs_set_path_blocking(p);
2483                 reada_for_balance(fs_info, p, level);
2484                 sret = balance_level(trans, root, p, level);
2485
2486                 if (sret) {
2487                         ret = sret;
2488                         goto done;
2489                 }
2490                 b = p->nodes[level];
2491                 if (!b) {
2492                         btrfs_release_path(p);
2493                         goto again;
2494                 }
2495                 BUG_ON(btrfs_header_nritems(b) == 1);
2496         }
2497         return 0;
2498
2499 again:
2500         ret = -EAGAIN;
2501 done:
2502         return ret;
2503 }
2504
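/*
 * debug-only (CONFIG_BTRFS_ASSERT) check that the first key of 'b'
 * really matches 'key' when the binary search is being skipped
 */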
2505 static void key_search_validate(struct extent_buffer *b,
2506                                 const struct btrfs_key *key,
2507                                 int level)
2508 {
2509 #ifdef CONFIG_BTRFS_ASSERT
2510         struct btrfs_disk_key disk_key;
2511
2512         btrfs_cpu_key_to_disk(&disk_key, key);
2513
2514         if (level == 0)
2515                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2516                     offsetof(struct btrfs_leaf, items[0].key),
2517                     sizeof(disk_key)));
2518         else
2519                 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2520                     offsetof(struct btrfs_node, ptrs[0].key),
2521                     sizeof(disk_key)));
2522 #endif
2523 }
2524
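/*
 * once a level has seen an exact match (*prev_cmp == 0), the key must
 * be the first key of every node further down the path, so the binary
 * search can be skipped for the rest of the descent
 */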
2525 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2526                       int level, int *prev_cmp, int *slot)
2527 {
2528         if (*prev_cmp != 0) {
2529                 *prev_cmp = btrfs_bin_search(b, key, level, slot);
2530                 return *prev_cmp;
2531         }
2532
2533         key_search_validate(b, key, level);
2534         *slot = 0;
2535
2536         return 0;
2537 }
2538
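/*
 * look up the key (iobjectid, key_type, ioff) in 'fs_root', stepping
 * to the next leaf on an inexact match.  Returns 0 and fills in
 * 'found_key' when an item with that objectid and type exists, 1 when
 * it doesn't, and < 0 on error.
 */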
2539 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2540                 u64 iobjectid, u64 ioff, u8 key_type,
2541                 struct btrfs_key *found_key)
2542 {
2543         int ret;
2544         struct btrfs_key key;
2545         struct extent_buffer *eb;
2546
2547         ASSERT(path);
2548         ASSERT(found_key);
2549
2550         key.type = key_type;
2551         key.objectid = iobjectid;
2552         key.offset = ioff;
2553
2554         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2555         if (ret < 0)
2556                 return ret;
2557
2558         eb = path->nodes[0];
2559         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2560                 ret = btrfs_next_leaf(fs_root, path);
2561                 if (ret)
2562                         return ret;
2563                 eb = path->nodes[0];
2564         }
2565
2566         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2567         if (found_key->type != key.type ||
2568                         found_key->objectid != key.objectid)
2569                 return 1;
2570
2571         return 0;
2572 }
2573
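/*
 * grab the root block for a search.  Commit roots are read without
 * locking (callers must set skip_locking), otherwise we take a read
 * lock and trade it for a write lock only when write_lock_level
 * requires the root to be write locked.
 */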
2574 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2575                                                         struct btrfs_path *p,
2576                                                         int write_lock_level)
2577 {
2578         struct btrfs_fs_info *fs_info = root->fs_info;
2579         struct extent_buffer *b;
2580         int root_lock;
2581         int level = 0;
2582
2583         /* We try very hard to do read locks on the root */
2584         root_lock = BTRFS_READ_LOCK;
2585
2586         if (p->search_commit_root) {
2587                 /* The commit roots are read only so we always do read locks */
2588                 if (p->need_commit_sem)
2589                         down_read(&fs_info->commit_root_sem);
2590                 b = root->commit_root;
2591                 extent_buffer_get(b);
2592                 level = btrfs_header_level(b);
2593                 if (p->need_commit_sem)
2594                         up_read(&fs_info->commit_root_sem);
2595                 /*
2596                  * Ensure that all callers have set skip_locking when
2597                  * p->search_commit_root = 1.
2598                  */
2599                 ASSERT(p->skip_locking == 1);
2600
2601                 goto out;
2602         }
2603
2604         if (p->skip_locking) {
2605                 b = btrfs_root_node(root);
2606                 level = btrfs_header_level(b);
2607                 goto out;
2608         }
2609
2610         /*
2611          * If the level is set to maximum, we can skip trying to get the read
2612          * lock.
2613          */
2614         if (write_lock_level < BTRFS_MAX_LEVEL) {
2615                 /*
2616                  * We don't know the level of the root node until we actually
2617                  * have it read locked
2618                  */
2619                 b = btrfs_read_lock_root_node(root);
2620                 level = btrfs_header_level(b);
2621                 if (level > write_lock_level)
2622                         goto out;
2623
2624                 /* Whoops, must trade for write lock */
2625                 btrfs_tree_read_unlock(b);
2626                 free_extent_buffer(b);
2627         }
2628
2629         b = btrfs_lock_root_node(root);
2630         root_lock = BTRFS_WRITE_LOCK;
2631
2632         /* The level might have changed, check again */
2633         level = btrfs_header_level(b);
2634
2635 out:
2636         p->nodes[level] = b;
2637         if (!p->skip_locking)
2638                 p->locks[level] = root_lock;
2639         /*
2640          * Callers are responsible for dropping b's references.
2641          */
2642         return b;
2643 }
2644
2645
2646 /*
2647  * btrfs_search_slot - look for a key in a tree and perform necessary
2648  * modifications to preserve tree invariants.
2649  *
2650  * @trans:      Handle of transaction, used when modifying the tree
2651  * @p:          Holds all btree nodes along the search path
2652  * @root:       The root node of the tree
2653  * @key:        The key we are looking for
2654  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2655  *              deletions it's -1. 0 for plain searches
2656  * @cow:        whether CoW operations should be performed.  Must always be
2657  *              1 when modifying the tree.
2658  *
2659  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2660  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2661  *
2662  * If @key is found, 0 is returned and you can find the item in the leaf level
2663  * of the path (level 0)
2664  *
2665  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2666  * points to the slot where it should be inserted
2667  *
2668  * If an error is encountered while searching the tree a negative error number
2669  * is returned
2670  */
2671 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2672                       const struct btrfs_key *key, struct btrfs_path *p,
2673                       int ins_len, int cow)
2674 {
2675         struct btrfs_fs_info *fs_info = root->fs_info;
2676         struct extent_buffer *b;
2677         int slot;
2678         int ret;
2679         int err;
2680         int level;
2681         int lowest_unlock = 1;
2682         /* everything at write_lock_level or lower must be write locked */
2683         int write_lock_level = 0;
2684         u8 lowest_level = 0;
2685         int min_write_lock_level;
2686         int prev_cmp;
2687
2688         lowest_level = p->lowest_level;
2689         WARN_ON(lowest_level && ins_len > 0);
2690         WARN_ON(p->nodes[0] != NULL);
2691         BUG_ON(!cow && ins_len);
2692
2693         if (ins_len < 0) {
2694                 lowest_unlock = 2;
2695
2696                 /* when we are removing items, we might have to go up to level
2697                  * two as we update tree pointers.  Make sure we keep write
2698                  * locks for those levels as well
2699                  */
2700                 write_lock_level = 2;
2701         } else if (ins_len > 0) {
2702                 /*
2703                  * for inserting items, make sure we have a write lock on
2704                  * level 1 so we can update keys
2705                  */
2706                 write_lock_level = 1;
2707         }
2708
2709         if (!cow)
2710                 write_lock_level = -1;
2711
2712         if (cow && (p->keep_locks || p->lowest_level))
2713                 write_lock_level = BTRFS_MAX_LEVEL;
2714
2715         min_write_lock_level = write_lock_level;
2716
2717 again:
2718         prev_cmp = -1;
2719         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2720
2721         while (b) {
2722                 level = btrfs_header_level(b);
2723
2724                 /*
2725                  * setup the path here so we can release it under lock
2726                  * contention with the cow code
2727                  */
2728                 if (cow) {
2729                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2730
2731                         /*
2732                          * if we don't really need to cow this block
2733                          * then we don't want to set the path blocking,
2734                          * so we test it here
2735                          */
2736                         if (!should_cow_block(trans, root, b)) {
2737                                 trans->dirty = true;
2738                                 goto cow_done;
2739                         }
2740
2741                         /*
2742                          * must have write locks on this node and the
2743                          * parent
2744                          */
2745                         if (level > write_lock_level ||
2746                             (level + 1 > write_lock_level &&
2747                             level + 1 < BTRFS_MAX_LEVEL &&
2748                             p->nodes[level + 1])) {
2749                                 write_lock_level = level + 1;
2750                                 btrfs_release_path(p);
2751                                 goto again;
2752                         }
2753
2754                         btrfs_set_path_blocking(p);
2755                         if (last_level)
2756                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2757                                                       &b);
2758                         else
2759                                 err = btrfs_cow_block(trans, root, b,
2760                                                       p->nodes[level + 1],
2761                                                       p->slots[level + 1], &b);
2762                         if (err) {
2763                                 ret = err;
2764                                 goto done;
2765                         }
2766                 }
2767 cow_done:
2768                 p->nodes[level] = b;
2769                 /*
2770                  * Leave path with blocking locks to avoid massive
2771                  * lock context switches; this is intentional.
2772                  */
2773
2774                 /*
2775                  * we have a lock on b and as long as we aren't changing
2776                  * the tree, there is no way for the items in b to change.
2777                  * It is safe to drop the lock on our parent before we
2778                  * go through the expensive btree search on b.
2779                  *
2780                  * If we're inserting or deleting (ins_len != 0), then we might
2781                  * be changing slot zero, which may require changing the parent.
2782                  * So, we can't drop the lock until after we know which slot
2783                  * we're operating on.
2784                  */
2785                 if (!ins_len && !p->keep_locks) {
2786                         int u = level + 1;
2787
2788                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2789                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2790                                 p->locks[u] = 0;
2791                         }
2792                 }
2793
2794                 ret = key_search(b, key, level, &prev_cmp, &slot);
2795                 if (ret < 0)
2796                         goto done;
2797
2798                 if (level != 0) {
2799                         int dec = 0;
2800                         if (ret && slot > 0) {
2801                                 dec = 1;
2802                                 slot -= 1;
2803                         }
2804                         p->slots[level] = slot;
2805                         err = setup_nodes_for_search(trans, root, p, b, level,
2806                                              ins_len, &write_lock_level);
2807                         if (err == -EAGAIN)
2808                                 goto again;
2809                         if (err) {
2810                                 ret = err;
2811                                 goto done;
2812                         }
2813                         b = p->nodes[level];
2814                         slot = p->slots[level];
2815
2816                         /*
2817                          * slot 0 is special: if we change the key,
2818                          * we have to update the parent pointer,
2819                          * which means we must have a write lock
2820                          * on the parent
2821                          */
2822                         if (slot == 0 && ins_len &&
2823                             write_lock_level < level + 1) {
2824                                 write_lock_level = level + 1;
2825                                 btrfs_release_path(p);
2826                                 goto again;
2827                         }
2828
2829                         unlock_up(p, level, lowest_unlock,
2830                                   min_write_lock_level, &write_lock_level);
2831
2832                         if (level == lowest_level) {
2833                                 if (dec)
2834                                         p->slots[level]++;
2835                                 goto done;
2836                         }
2837
2838                         err = read_block_for_search(root, p, &b, level,
2839                                                     slot, key);
2840                         if (err == -EAGAIN)
2841                                 goto again;
2842                         if (err) {
2843                                 ret = err;
2844                                 goto done;
2845                         }
2846
2847                         if (!p->skip_locking) {
2848                                 level = btrfs_header_level(b);
2849                                 if (level <= write_lock_level) {
2850                                         err = btrfs_try_tree_write_lock(b);
2851                                         if (!err) {
2852                                                 btrfs_set_path_blocking(p);
2853                                                 btrfs_tree_lock(b);
2854                                         }
2855                                         p->locks[level] = BTRFS_WRITE_LOCK;
2856                                 } else {
2857                                         err = btrfs_tree_read_lock_atomic(b);
2858                                         if (!err) {
2859                                                 btrfs_set_path_blocking(p);
2860                                                 btrfs_tree_read_lock(b);
2861                                         }
2862                                         p->locks[level] = BTRFS_READ_LOCK;
2863                                 }
2864                                 p->nodes[level] = b;
2865                         }
2866                 } else {
2867                         p->slots[level] = slot;
2868                         if (ins_len > 0 &&
2869                             btrfs_leaf_free_space(fs_info, b) < ins_len) {
2870                                 if (write_lock_level < 1) {
2871                                         write_lock_level = 1;
2872                                         btrfs_release_path(p);
2873                                         goto again;
2874                                 }
2875
2876                                 btrfs_set_path_blocking(p);
2877                                 err = split_leaf(trans, root, key,
2878                                                  p, ins_len, ret == 0);
2879
2880                                 BUG_ON(err > 0);
2881                                 if (err) {
2882                                         ret = err;
2883                                         goto done;
2884                                 }
2885                         }
2886                         if (!p->search_for_split)
2887                                 unlock_up(p, level, lowest_unlock,
2888                                           min_write_lock_level, NULL);
2889                         goto done;
2890                 }
2891         }
2892         ret = 1;
2893 done:
2894         /*
2895          * we don't really know what the caller plans to do with the path
2896          * from here on, so for now just mark it as blocking
2897          */
2898         if (!p->leave_spinning)
2899                 btrfs_set_path_blocking(p);
2900         if (ret < 0 && !p->skip_release_on_error)
2901                 btrfs_release_path(p);
2902         return ret;
2903 }
2904
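/*
 * A typical read-only lookup with btrfs_search_slot() looks roughly like
 * the sketch below.  This is illustrative only: 'ino' is a hypothetical
 * inode number and error handling is trimmed.  A return of 0 leaves the
 * item at path->nodes[0] / path->slots[0]; 1 means not found with the path
 * at the insert position, which read-only callers usually map to -ENOENT.
 *
 *        struct btrfs_path *path;
 *        struct btrfs_key key;
 *        int ret;
 *
 *        path = btrfs_alloc_path();
 *        if (!path)
 *                return -ENOMEM;
 *        key.objectid = ino;
 *        key.type = BTRFS_INODE_ITEM_KEY;
 *        key.offset = 0;
 *        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *        if (ret > 0)
 *                ret = -ENOENT;
 *        btrfs_free_path(path);
 */
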
2905 /*
2906  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2907  * current state of the tree together with the operations recorded in the tree
2908  * modification log to search for the key in a previous version of this tree, as
2909  * denoted by the time_seq parameter.
2910  *
2911  * Naturally, there is no support for insert, delete or cow operations.
2912  *
2913  * The resulting path and return value will be set up as if we called
2914  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2915  */
2916 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2917                           struct btrfs_path *p, u64 time_seq)
2918 {
2919         struct btrfs_fs_info *fs_info = root->fs_info;
2920         struct extent_buffer *b;
2921         int slot;
2922         int ret;
2923         int err;
2924         int level;
2925         int lowest_unlock = 1;
2926         u8 lowest_level = 0;
2927         int prev_cmp = -1;
2928
2929         lowest_level = p->lowest_level;
2930         WARN_ON(p->nodes[0] != NULL);
2931
2932         if (p->search_commit_root) {
2933                 BUG_ON(time_seq);
2934                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2935         }
2936
2937 again:
2938         b = get_old_root(root, time_seq);
2939         if (!b) {
2940                 ret = -EIO;
2941                 goto done;
2942         }
2943         level = btrfs_header_level(b);
2944         p->locks[level] = BTRFS_READ_LOCK;
2945
2946         while (b) {
2947                 level = btrfs_header_level(b);
2948                 p->nodes[level] = b;
2949
2950                 /*
2951                  * we have a lock on b and as long as we aren't changing
2952                  * the tree, there is no way for the items in b to change.
2953                  * It is safe to drop the lock on our parent before we
2954                  * go through the expensive btree search on b.
2955                  */
2956                 btrfs_unlock_up_safe(p, level + 1);
2957
2958                 /*
2959                  * Since we can unwind extent buffers we want to do a real
2960                  * search every time.
2961                  */
2962                 prev_cmp = -1;
2963                 ret = key_search(b, key, level, &prev_cmp, &slot);
2964
2965                 if (level != 0) {
2966                         int dec = 0;
2967                         if (ret && slot > 0) {
2968                                 dec = 1;
2969                                 slot -= 1;
2970                         }
2971                         p->slots[level] = slot;
2972                         unlock_up(p, level, lowest_unlock, 0, NULL);
2973
2974                         if (level == lowest_level) {
2975                                 if (dec)
2976                                         p->slots[level]++;
2977                                 goto done;
2978                         }
2979
2980                         err = read_block_for_search(root, p, &b, level,
2981                                                     slot, key);
2982                         if (err == -EAGAIN)
2983                                 goto again;
2984                         if (err) {
2985                                 ret = err;
2986                                 goto done;
2987                         }
2988
2989                         level = btrfs_header_level(b);
2990                         err = btrfs_tree_read_lock_atomic(b);
2991                         if (!err) {
2992                                 btrfs_set_path_blocking(p);
2993                                 btrfs_tree_read_lock(b);
2994                         }
2995                         b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2996                         if (!b) {
2997                                 ret = -ENOMEM;
2998                                 goto done;
2999                         }
3000                         p->locks[level] = BTRFS_READ_LOCK;
3001                         p->nodes[level] = b;
3002                 } else {
3003                         p->slots[level] = slot;
3004                         unlock_up(p, level, lowest_unlock, 0, NULL);
3005                         goto done;
3006                 }
3007         }
3008         ret = 1;
3009 done:
3010         if (!p->leave_spinning)
3011                 btrfs_set_path_blocking(p);
3012         if (ret < 0)
3013                 btrfs_release_path(p);
3014
3015         return ret;
3016 }
3017
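/*
 * A hedged usage sketch (not taken from this file): time_seq normally comes
 * from pinning the tree mod log, roughly:
 *
 *        struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *        btrfs_get_tree_mod_seq(fs_info, &elem);
 *        ret = btrfs_search_old_slot(root, &key, path, elem.seq);
 *        ...
 *        btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the element is held, the log entries needed to rewind to elem.seq
 * are kept alive; dropping it allows them to be freed.
 */
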
3018 /*
3019  * helper to use instead of btrfs_search_slot if no exact match is needed;
3020  * the next or previous item is returned instead.
3021  * When find_higher is true, the next higher item is returned, otherwise the
3022  * next lower one.
3023  * When return_any and find_higher are both true, and no higher item is found,
3024  * return the next lower instead.
3025  * When return_any is true and find_higher is false, and no lower item is found,
3026  * return the next higher instead.
3027  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3028  * < 0 on error
3029  */
3030 int btrfs_search_slot_for_read(struct btrfs_root *root,
3031                                const struct btrfs_key *key,
3032                                struct btrfs_path *p, int find_higher,
3033                                int return_any)
3034 {
3035         int ret;
3036         struct extent_buffer *leaf;
3037
3038 again:
3039         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3040         if (ret <= 0)
3041                 return ret;
3042         /*
3043          * a return value of 1 means the path is at the position where the
3044          * item should be inserted. Normally this is the next bigger item,
3045          * but if the previous item is the last in a leaf, path points
3046          * to the first free slot in the previous leaf, i.e. at an invalid
3047          * item.
3048          */
3049         leaf = p->nodes[0];
3050
3051         if (find_higher) {
3052                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3053                         ret = btrfs_next_leaf(root, p);
3054                         if (ret <= 0)
3055                                 return ret;
3056                         if (!return_any)
3057                                 return 1;
3058                         /*
3059                          * no higher item found, return the next
3060                          * lower instead
3061                          */
3062                         return_any = 0;
3063                         find_higher = 0;
3064                         btrfs_release_path(p);
3065                         goto again;
3066                 }
3067         } else {
3068                 if (p->slots[0] == 0) {
3069                         ret = btrfs_prev_leaf(root, p);
3070                         if (ret < 0)
3071                                 return ret;
3072                         if (!ret) {
3073                                 leaf = p->nodes[0];
3074                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3075                                         p->slots[0]--;
3076                                 return 0;
3077                         }
3078                         if (!return_any)
3079                                 return 1;
3080                         /*
3081                          * no lower item found, return the next
3082                          * higher instead
3083                          */
3084                         return_any = 0;
3085                         find_higher = 1;
3086                         btrfs_release_path(p);
3087                         goto again;
3088                 } else {
3089                         --p->slots[0];
3090                 }
3091         }
3092         return 0;
3093 }
3094
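/*
 * An illustrative sketch of the common "first item at or after key" idiom
 * with this helper ('objectid' and 'found_key' are hypothetical, not taken
 * from this file):
 *
 *        key.objectid = objectid;
 *        key.type = BTRFS_EXTENT_DATA_KEY;
 *        key.offset = 0;
 *        ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
 *        if (ret == 0)
 *                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *                                      path->slots[0]);
 */
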
3095 /*
3096  * adjust the pointers going up the tree, starting at level,
3097  * making sure the right key of each node points to 'key'.
3098  * This is used after shifting pointers to the left, so it stops
3099  * fixing up pointers when a given leaf/node is not in slot 0 of the
3100  * higher levels
3101  *
3102  */
3103 static void fixup_low_keys(struct btrfs_path *path,
3104                            struct btrfs_disk_key *key, int level)
3105 {
3106         int i;
3107         struct extent_buffer *t;
3108         int ret;
3109
3110         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3111                 int tslot = path->slots[i];
3112
3113                 if (!path->nodes[i])
3114                         break;
3115                 t = path->nodes[i];
3116                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3117                                 GFP_ATOMIC);
3118                 BUG_ON(ret < 0);
3119                 btrfs_set_node_key(t, key, tslot);
3120                 btrfs_mark_buffer_dirty(path->nodes[i]);
3121                 if (tslot != 0)
3122                         break;
3123         }
3124 }
3125
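/*
 * A worked example with hypothetical slots: if a leaf's first key changes
 * and the leaf sits in slot 0 of its parent, the parent's key 0 is
 * rewritten as well; if that parent sits in slot 3 of the grandparent, the
 * loop stops there, because a non-zero slot means the changed key is not
 * the first key of any node higher up.
 */
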
3126 /*
3127  * update item key.
3128  *
3129  * This function isn't completely safe. It's the caller's responsibility
3130  * to ensure the new key doesn't break the key order.
3131  */
3132 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3133                              struct btrfs_path *path,
3134                              const struct btrfs_key *new_key)
3135 {
3136         struct btrfs_disk_key disk_key;
3137         struct extent_buffer *eb;
3138         int slot;
3139
3140         eb = path->nodes[0];
3141         slot = path->slots[0];
3142         if (slot > 0) {
3143                 btrfs_item_key(eb, &disk_key, slot - 1);
3144                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3145         }
3146         if (slot < btrfs_header_nritems(eb) - 1) {
3147                 btrfs_item_key(eb, &disk_key, slot + 1);
3148                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3149         }
3150
3151         btrfs_cpu_key_to_disk(&disk_key, new_key);
3152         btrfs_set_item_key(eb, &disk_key, slot);
3153         btrfs_mark_buffer_dirty(eb);
3154         if (slot == 0)
3155                 fixup_low_keys(path, &disk_key, 1);
3156 }
3157
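/*
 * A hedged sketch of typical usage ('shift' is hypothetical, not taken from
 * this file): moving a file extent item's key offset forward while the leaf
 * stays sorted:
 *
 *        struct btrfs_key new_key;
 *
 *        btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *        new_key.offset += shift;
 *        btrfs_set_item_key_safe(fs_info, path, &new_key);
 *
 * The BUG_ONs above catch a new key that would collide with or reorder
 * against its neighbouring items.
 */
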
3158 /*
3159  * try to push data from one node into the next node left in the
3160  * tree.
3161  *
3162  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3163  * error, and > 0 if there was no room in the left hand block.
3164  */
3165 static int push_node_left(struct btrfs_trans_handle *trans,
3166                           struct btrfs_fs_info *fs_info,
3167                           struct extent_buffer *dst,
3168                           struct extent_buffer *src, int empty)
3169 {
3170         int push_items = 0;
3171         int src_nritems;
3172         int dst_nritems;
3173         int ret = 0;
3174
3175         src_nritems = btrfs_header_nritems(src);
3176         dst_nritems = btrfs_header_nritems(dst);
3177         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3178         WARN_ON(btrfs_header_generation(src) != trans->transid);
3179         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3180
3181         if (!empty && src_nritems <= 8)
3182                 return 1;
3183
3184         if (push_items <= 0)
3185                 return 1;
3186
3187         if (empty) {
3188                 push_items = min(src_nritems, push_items);
3189                 if (push_items < src_nritems) {
3190                         /* leave at least 8 pointers in the node if
3191                          * we aren't going to empty it
3192                          */
3193                         if (src_nritems - push_items < 8) {
3194                                 if (push_items <= 8)
3195                                         return 1;
3196                                 push_items -= 8;
3197                         }
3198                 }
3199         } else
3200                 push_items = min(src_nritems - 8, push_items);
3201
3202         ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3203                                    push_items);
3204         if (ret) {
3205                 btrfs_abort_transaction(trans, ret);
3206                 return ret;
3207         }
3208         copy_extent_buffer(dst, src,
3209                            btrfs_node_key_ptr_offset(dst_nritems),
3210                            btrfs_node_key_ptr_offset(0),
3211                            push_items * sizeof(struct btrfs_key_ptr));
3212
3213         if (push_items < src_nritems) {
3214                 /*
3215                  * Don't call tree_mod_log_insert_move here, key removal was
3216                  * already fully logged by tree_mod_log_eb_copy above.
3217                  */
3218                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3219                                       btrfs_node_key_ptr_offset(push_items),
3220                                       (src_nritems - push_items) *
3221                                       sizeof(struct btrfs_key_ptr));
3222         }
3223         btrfs_set_header_nritems(src, src_nritems - push_items);
3224         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3225         btrfs_mark_buffer_dirty(src);
3226         btrfs_mark_buffer_dirty(dst);
3227
3228         return ret;
3229 }
3230
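/*
 * A worked example with hypothetical counts: with a 16K nodesize,
 * BTRFS_NODEPTRS_PER_BLOCK() is roughly 493 pointers.  If dst holds 480
 * pointers and src holds 100, push_items starts out as 13; the non-empty
 * case then caps it at src_nritems - 8 = 92, so all 13 move and src keeps
 * 87 pointers.
 */
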
3231 /*
3232  * try to push data from one node into the next node right in the
3233  * tree.
3234  *
3235  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3236  * error, and > 0 if there was no room in the right hand block.
3237  *
3238  * this will only push up to 1/2 the contents of the left node over
3239  */
3240 static int balance_node_right(struct btrfs_trans_handle *trans,
3241                               struct btrfs_fs_info *fs_info,
3242                               struct extent_buffer *dst,
3243                               struct extent_buffer *src)
3244 {
3245         int push_items = 0;
3246         int max_push;
3247         int src_nritems;
3248         int dst_nritems;
3249         int ret = 0;
3250
3251         WARN_ON(btrfs_header_generation(src) != trans->transid);
3252         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3253
3254         src_nritems = btrfs_header_nritems(src);
3255         dst_nritems = btrfs_header_nritems(dst);
3256         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3257         if (push_items <= 0)
3258                 return 1;
3259
3260         if (src_nritems < 4)
3261                 return 1;
3262
3263         max_push = src_nritems / 2 + 1;
3264         /* don't try to empty the node */
3265         if (max_push >= src_nritems)
3266                 return 1;
3267
3268         if (max_push < push_items)
3269                 push_items = max_push;
3270
3271         ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3272         BUG_ON(ret < 0);
3273         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3274                                       btrfs_node_key_ptr_offset(0),
3275                                       (dst_nritems) *
3276                                       sizeof(struct btrfs_key_ptr));
3277
3278         ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3279                                    src_nritems - push_items, push_items);
3280         if (ret) {
3281                 btrfs_abort_transaction(trans, ret);
3282                 return ret;
3283         }
3284         copy_extent_buffer(dst, src,
3285                            btrfs_node_key_ptr_offset(0),
3286                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3287                            push_items * sizeof(struct btrfs_key_ptr));
3288
3289         btrfs_set_header_nritems(src, src_nritems - push_items);
3290         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3291
3292         btrfs_mark_buffer_dirty(src);
3293         btrfs_mark_buffer_dirty(dst);
3294
3295         return ret;
3296 }
3297
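/*
 * A worked example with hypothetical counts: for src_nritems = 100,
 * max_push = 100 / 2 + 1 = 51, so at most 51 pointers move right no matter
 * how much room dst has, leaving src roughly half full.
 */
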
3298 /*
3299  * helper function to insert a new root level in the tree.
3300  * A new node is allocated, and a single item is inserted to
3301  * point to the existing root
3302  *
3303  * returns zero on success or < 0 on failure.
3304  */
3305 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3306                            struct btrfs_root *root,
3307                            struct btrfs_path *path, int level)
3308 {
3309         struct btrfs_fs_info *fs_info = root->fs_info;
3310         u64 lower_gen;
3311         struct extent_buffer *lower;
3312         struct extent_buffer *c;
3313         struct extent_buffer *old;
3314         struct btrfs_disk_key lower_key;
3315         int ret;
3316
3317         BUG_ON(path->nodes[level]);
3318         BUG_ON(path->nodes[level-1] != root->node);
3319
3320         lower = path->nodes[level-1];
3321         if (level == 1)
3322                 btrfs_item_key(lower, &lower_key, 0);
3323         else
3324                 btrfs_node_key(lower, &lower_key, 0);
3325
3326         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3327                                    &lower_key, level, root->node->start, 0);
3328         if (IS_ERR(c))
3329                 return PTR_ERR(c);
3330
3331         root_add_used(root, fs_info->nodesize);
3332
3333         btrfs_set_header_nritems(c, 1);
3334         btrfs_set_node_key(c, &lower_key, 0);
3335         btrfs_set_node_blockptr(c, 0, lower->start);
3336         lower_gen = btrfs_header_generation(lower);
3337         WARN_ON(lower_gen != trans->transid);
3338
3339         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3340
3341         btrfs_mark_buffer_dirty(c);
3342
3343         old = root->node;
3344         ret = tree_mod_log_insert_root(root->node, c, 0);
3345         BUG_ON(ret < 0);
3346         rcu_assign_pointer(root->node, c);
3347
3348         /* the super has an extra ref to root->node */
3349         free_extent_buffer(old);
3350
3351         add_root_to_dirty_list(root);
3352         extent_buffer_get(c);
3353         path->nodes[level] = c;
3354         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3355         path->slots[level] = 0;
3356         return 0;
3357 }
3358
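/*
 * For example (hypothetical shape): growing a tree whose root is a single
 * leaf yields a level-1 root with nritems == 1 pointing at the old leaf;
 * the split that follows then has a parent into which insert_ptr() can add
 * the second half.
 */
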
3359 /*
3360  * worker function to insert a single pointer in a node.
3361  * the node should have enough room for the pointer already
3362  *
3363  * slot and level indicate where you want the key to go, and
3364  * bytenr is the block the key points to.
3365  */
3366 static void insert_ptr(struct btrfs_trans_handle *trans,
3367                        struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3368                        struct btrfs_disk_key *key, u64 bytenr,
3369                        int slot, int level)
3370 {
3371         struct extent_buffer *lower;
3372         int nritems;
3373         int ret;
3374
3375         BUG_ON(!path->nodes[level]);
3376         btrfs_assert_tree_locked(path->nodes[level]);
3377         lower = path->nodes[level];
3378         nritems = btrfs_header_nritems(lower);
3379         BUG_ON(slot > nritems);
3380         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3381         if (slot != nritems) {
3382                 if (level) {
3383                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3384                                         nritems - slot);
3385                         BUG_ON(ret < 0);
3386                 }
3387                 memmove_extent_buffer(lower,
3388                               btrfs_node_key_ptr_offset(slot + 1),
3389                               btrfs_node_key_ptr_offset(slot),
3390                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3391         }
3392         if (level) {
3393                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3394                                 GFP_NOFS);
3395                 BUG_ON(ret < 0);
3396         }
3397         btrfs_set_node_key(lower, key, slot);
3398         btrfs_set_node_blockptr(lower, slot, bytenr);
3399         WARN_ON(trans->transid == 0);
3400         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3401         btrfs_set_header_nritems(lower, nritems + 1);
3402         btrfs_mark_buffer_dirty(lower);
3403 }
3404
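/*
 * A worked example with hypothetical slots: inserting at slot 2 of a node
 * with nritems == 5 first shifts the key_ptrs in slots 2..4 up to slots
 * 3..5, then writes the new key and blockptr into slot 2 and bumps nritems
 * to 6.
 */
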
3405 /*
3406  * split the node at the specified level in path in two.
3407  * The path is corrected to point to the appropriate node after the split
3408  *
3409  * Before splitting this tries to make some room in the node by pushing
3410  * left and right, if either one works, it returns right away.
3411  *
3412  * returns 0 on success and < 0 on failure
3413  */
3414 static noinline int split_node(struct btrfs_trans_handle *trans,
3415                                struct btrfs_root *root,
3416                                struct btrfs_path *path, int level)
3417 {
3418         struct btrfs_fs_info *fs_info = root->fs_info;
3419         struct extent_buffer *c;
3420         struct extent_buffer *split;
3421         struct btrfs_disk_key disk_key;
3422         int mid;
3423         int ret;
3424         u32 c_nritems;
3425
3426         c = path->nodes[level];
3427         WARN_ON(btrfs_header_generation(c) != trans->transid);
3428         if (c == root->node) {
3429                 /*
3430                  * trying to split the root, let's make a new one
3431                  *
3432                  * tree mod log: We don't log removal of the old root in
3433                  * insert_new_root, because that root buffer will be kept as a
3434                  * normal node. We are going to log removal of half of the
3435                  * elements below with tree_mod_log_eb_copy. We're holding a
3436                  * tree lock on the buffer, which is why we cannot race with
3437                  * other tree_mod_log users.
3438                  */
3439                 ret = insert_new_root(trans, root, path, level + 1);
3440                 if (ret)
3441                         return ret;
3442         } else {
3443                 ret = push_nodes_for_insert(trans, root, path, level);
3444                 c = path->nodes[level];
3445                 if (!ret && btrfs_header_nritems(c) <
3446                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3447                         return 0;
3448                 if (ret < 0)
3449                         return ret;
3450         }
3451
3452         c_nritems = btrfs_header_nritems(c);
3453         mid = (c_nritems + 1) / 2;
3454         btrfs_node_key(c, &disk_key, mid);
3455
3456         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3457                         &disk_key, level, c->start, 0);
3458         if (IS_ERR(split))
3459                 return PTR_ERR(split);
3460
3461         root_add_used(root, fs_info->nodesize);
3462         ASSERT(btrfs_header_level(c) == level);
3463
3464         ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3465         if (ret) {
3466                 btrfs_abort_transaction(trans, ret);
3467                 return ret;
3468         }
3469         copy_extent_buffer(split, c,
3470                            btrfs_node_key_ptr_offset(0),
3471                            btrfs_node_key_ptr_offset(mid),
3472                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3473         btrfs_set_header_nritems(split, c_nritems - mid);
3474         btrfs_set_header_nritems(c, mid);
3475         ret = 0;
3476
3477         btrfs_mark_buffer_dirty(c);
3478         btrfs_mark_buffer_dirty(split);
3479
3480         insert_ptr(trans, fs_info, path, &disk_key, split->start,
3481                    path->slots[level + 1] + 1, level + 1);
3482
3483         if (path->slots[level] >= mid) {
3484                 path->slots[level] -= mid;
3485                 btrfs_tree_unlock(c);
3486                 free_extent_buffer(c);
3487                 path->nodes[level] = split;
3488                 path->slots[level + 1] += 1;
3489         } else {
3490                 btrfs_tree_unlock(split);
3491                 free_extent_buffer(split);
3492         }
3493         return ret;
3494 }
3495
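/*
 * A worked example with hypothetical counts: splitting a node with
 * c_nritems == 9 gives mid = (9 + 1) / 2 = 5, so pointers 5..8 move to the
 * new buffer and c keeps 5.  A path slot of 7 becomes 7 - 5 = 2 in 'split',
 * with path->slots[level + 1] incremented to point at the new block.
 */
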
3496 /*
3497  * how many bytes are required to store the items in a leaf.  start
3498  * and nr indicate which items in the leaf to check.  This totals up the
3499  * space used both by the item structs and the item data
3500  */
3501 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3502 {
3503         struct btrfs_item *start_item;
3504         struct btrfs_item *end_item;
3505         struct btrfs_map_token token;
3506         int data_len;
3507         int nritems = btrfs_header_nritems(l);
3508         int end = min(nritems, start + nr) - 1;
3509
3510         if (!nr)
3511                 return 0;
3512         btrfs_init_map_token(&token);
3513         start_item = btrfs_item_nr(start);
3514         end_item = btrfs_item_nr(end);
3515         data_len = btrfs_token_item_offset(l, start_item, &token) +
3516                 btrfs_token_item_size(l, start_item, &token);
3517         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3518         data_len += sizeof(struct btrfs_item) * nr;
3519         WARN_ON(data_len < 0);
3520         return data_len;
3521 }
3522
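/*
 * A worked example with hypothetical items: leaf data grows from the end of
 * the block towards the item array, so for items 2..4 the data length is
 * item 2's end (offset + size) minus item 4's offset; the
 * nr * sizeof(struct btrfs_item) term (25 bytes per item) adds in the item
 * headers themselves.
 */
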
3523 /*
3524  * The space between the end of the leaf items and
3525  * the start of the leaf data.  IOW, how much room
3526  * the leaf has left for both items and data
3527  */
3528 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3529                                    struct extent_buffer *leaf)
3530 {
3531         int nritems = btrfs_header_nritems(leaf);
3532         int ret;
3533
3534         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3535         if (ret < 0) {
3536                 btrfs_crit(fs_info,
3537                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3538                            ret,
3539                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3540                            leaf_space_used(leaf, 0, nritems), nritems);
3541         }
3542         return ret;
3543 }
3544
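/*
 * Hypothetical numbers for scale: with a 16K nodesize,
 * BTRFS_LEAF_DATA_SIZE() is about 16283 bytes (nodesize minus the header),
 * so a leaf using 16000 bytes has 283 bytes free and an insertion with
 * ins_len == 300 must go through split_leaf() first.
 */
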
3545 /*
3546  * min slot controls the lowest index we're willing to push to the
3547  * right.  We'll push up to and including min_slot, but no lower
3548  */
3549 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3550                                       struct btrfs_path *path,
3551                                       int data_size, int empty,
3552                                       struct extent_buffer *right,
3553                                       int free_space, u32 left_nritems,
3554                                       u32 min_slot)
3555 {
3556         struct extent_buffer *left = path->nodes[0];
3557         struct extent_buffer *upper = path->nodes[1];
3558         struct btrfs_map_token token;
3559         struct btrfs_disk_key disk_key;
3560         int slot;
3561         u32 i;
3562         int push_space = 0;
3563         int push_items = 0;
3564         struct btrfs_item *item;
3565         u32 nr;
3566         u32 right_nritems;
3567         u32 data_end;
3568         u32 this_item_size;
3569
3570         btrfs_init_map_token(&token);
3571
3572         if (empty)
3573                 nr = 0;
3574         else
3575                 nr = max_t(u32, 1, min_slot);
3576
3577         if (path->slots[0] >= left_nritems)
3578                 push_space += data_size;
3579
3580         slot = path->slots[1];
3581         i = left_nritems - 1;
3582         while (i >= nr) {
3583                 item = btrfs_item_nr(i);
3584
3585                 if (!empty && push_items > 0) {
3586                         if (path->slots[0] > i)
3587                                 break;
3588                         if (path->slots[0] == i) {
3589                                 int space = btrfs_leaf_free_space(fs_info, left);
3590                                 if (space + push_space * 2 > free_space)
3591                                         break;
3592                         }
3593                 }
3594
3595                 if (path->slots[0] == i)
3596                         push_space += data_size;
3597
3598                 this_item_size = btrfs_item_size(left, item);
3599                 if (this_item_size + sizeof(*item) + push_space > free_space)
3600                         break;
3601
3602                 push_items++;
3603                 push_space += this_item_size + sizeof(*item);
3604                 if (i == 0)
3605                         break;
3606                 i--;
3607         }
3608
3609         if (push_items == 0)
3610                 goto out_unlock;
3611
3612         WARN_ON(!empty && push_items == left_nritems);
3613
3614         /* push left to right */
3615         right_nritems = btrfs_header_nritems(right);
3616
3617         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3618         push_space -= leaf_data_end(fs_info, left);
3619
3620         /* make room in the right data area */
3621         data_end = leaf_data_end(fs_info, right);
3622         memmove_extent_buffer(right,
3623                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3624                               BTRFS_LEAF_DATA_OFFSET + data_end,
3625                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3626
3627         /* copy from the left data area */
3628         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3629                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3630                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3631                      push_space);
3632
3633         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3634                               btrfs_item_nr_offset(0),
3635                               right_nritems * sizeof(struct btrfs_item));
3636
3637         /* copy the items from left to right */