3fd44835b3869effbfe6ced6c3b8b373617698e5
[sfrench/cifs-2.6.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15
/*
 * Prototypes for static helpers that are referenced before their
 * definitions appear in this file.
 */
static int split_node(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct btrfs_path *path,
                      int level);
static int split_leaf(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      const struct btrfs_key *ins_key,
                      struct btrfs_path *path,
                      int data_size,
                      int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info,
                          struct extent_buffer *dst,
                          struct extent_buffer *src,
                          int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root,
                    struct btrfs_path *path,
                    int level,
                    int slot);
31
32 struct btrfs_path *btrfs_alloc_path(void)
33 {
34         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
35 }
36
37 /*
38  * set all locked nodes in the path to blocking locks.  This should
39  * be done before scheduling
40  */
41 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
42 {
43         int i;
44         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
45                 if (!p->nodes[i] || !p->locks[i])
46                         continue;
47                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
48                 if (p->locks[i] == BTRFS_READ_LOCK)
49                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
50                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
51                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
52         }
53 }
54
55 /*
56  * reset all the locked nodes in the patch to spinning locks.
57  *
58  * held is used to keep lockdep happy, when lockdep is enabled
59  * we set held to a blocking lock before we go around and
60  * retake all the spinlocks in the path.  You can safely use NULL
61  * for held
62  */
63 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
64                                         struct extent_buffer *held, int held_rw)
65 {
66         int i;
67
68         if (held) {
69                 btrfs_set_lock_blocking_rw(held, held_rw);
70                 if (held_rw == BTRFS_WRITE_LOCK)
71                         held_rw = BTRFS_WRITE_LOCK_BLOCKING;
72                 else if (held_rw == BTRFS_READ_LOCK)
73                         held_rw = BTRFS_READ_LOCK_BLOCKING;
74         }
75         btrfs_set_path_blocking(p);
76
77         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
78                 if (p->nodes[i] && p->locks[i]) {
79                         btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
80                         if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
81                                 p->locks[i] = BTRFS_WRITE_LOCK;
82                         else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
83                                 p->locks[i] = BTRFS_READ_LOCK;
84                 }
85         }
86
87         if (held)
88                 btrfs_clear_lock_blocking_rw(held, held_rw);
89 }
90
91 /* this also releases the path */
92 void btrfs_free_path(struct btrfs_path *p)
93 {
94         if (!p)
95                 return;
96         btrfs_release_path(p);
97         kmem_cache_free(btrfs_path_cachep, p);
98 }
99
100 /*
101  * path release drops references on the extent buffers in the path
102  * and it drops any locks held by this path
103  *
104  * It is safe to call this on paths that no locks or extent buffers held.
105  */
106 noinline void btrfs_release_path(struct btrfs_path *p)
107 {
108         int i;
109
110         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
111                 p->slots[i] = 0;
112                 if (!p->nodes[i])
113                         continue;
114                 if (p->locks[i]) {
115                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
116                         p->locks[i] = 0;
117                 }
118                 free_extent_buffer(p->nodes[i]);
119                 p->nodes[i] = NULL;
120         }
121 }
122
123 /*
124  * safely gets a reference on the root node of a tree.  A lock
125  * is not taken, so a concurrent writer may put a different node
126  * at the root of the tree.  See btrfs_lock_root_node for the
127  * looping required.
128  *
129  * The extent buffer returned by this has a reference taken, so
130  * it won't disappear.  It may stop being the root of the tree
131  * at any time because there are no locks held.
132  */
133 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
134 {
135         struct extent_buffer *eb;
136
137         while (1) {
138                 rcu_read_lock();
139                 eb = rcu_dereference(root->node);
140
141                 /*
142                  * RCU really hurts here, we could free up the root node because
143                  * it was COWed but we may not get the new root node yet so do
144                  * the inc_not_zero dance and if it doesn't work then
145                  * synchronize_rcu and try again.
146                  */
147                 if (atomic_inc_not_zero(&eb->refs)) {
148                         rcu_read_unlock();
149                         break;
150                 }
151                 rcu_read_unlock();
152                 synchronize_rcu();
153         }
154         return eb;
155 }
156
157 /* loop around taking references on and locking the root node of the
158  * tree until you end up with a lock on the root.  A locked buffer
159  * is returned, with a reference held.
160  */
161 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
162 {
163         struct extent_buffer *eb;
164
165         while (1) {
166                 eb = btrfs_root_node(root);
167                 btrfs_tree_lock(eb);
168                 if (eb == root->node)
169                         break;
170                 btrfs_tree_unlock(eb);
171                 free_extent_buffer(eb);
172         }
173         return eb;
174 }
175
176 /* loop around taking references on and locking the root node of the
177  * tree until you end up with a lock on the root.  A locked buffer
178  * is returned, with a reference held.
179  */
180 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
181 {
182         struct extent_buffer *eb;
183
184         while (1) {
185                 eb = btrfs_root_node(root);
186                 btrfs_tree_read_lock(eb);
187                 if (eb == root->node)
188                         break;
189                 btrfs_tree_read_unlock(eb);
190                 free_extent_buffer(eb);
191         }
192         return eb;
193 }
194
195 /* cowonly root (everything not a reference counted cow subvolume), just get
196  * put onto a simple dirty list.  transaction.c walks this to make sure they
197  * get properly updated on disk.
198  */
199 static void add_root_to_dirty_list(struct btrfs_root *root)
200 {
201         struct btrfs_fs_info *fs_info = root->fs_info;
202
203         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
204             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
205                 return;
206
207         spin_lock(&fs_info->trans_lock);
208         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
209                 /* Want the extent tree to be the last on the list */
210                 if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
211                         list_move_tail(&root->dirty_list,
212                                        &fs_info->dirty_cowonly_roots);
213                 else
214                         list_move(&root->dirty_list,
215                                   &fs_info->dirty_cowonly_roots);
216         }
217         spin_unlock(&fs_info->trans_lock);
218 }
219
220 /*
221  * used by snapshot creation to make a copy of a root for a tree with
222  * a given objectid.  The buffer with the new root node is returned in
223  * cow_ret, and this func returns zero on success or a negative error code.
224  */
225 int btrfs_copy_root(struct btrfs_trans_handle *trans,
226                       struct btrfs_root *root,
227                       struct extent_buffer *buf,
228                       struct extent_buffer **cow_ret, u64 new_root_objectid)
229 {
230         struct btrfs_fs_info *fs_info = root->fs_info;
231         struct extent_buffer *cow;
232         int ret = 0;
233         int level;
234         struct btrfs_disk_key disk_key;
235
236         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
237                 trans->transid != fs_info->running_transaction->transid);
238         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
239                 trans->transid != root->last_trans);
240
241         level = btrfs_header_level(buf);
242         if (level == 0)
243                 btrfs_item_key(buf, &disk_key, 0);
244         else
245                 btrfs_node_key(buf, &disk_key, 0);
246
247         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
248                         &disk_key, level, buf->start, 0);
249         if (IS_ERR(cow))
250                 return PTR_ERR(cow);
251
252         copy_extent_buffer_full(cow, buf);
253         btrfs_set_header_bytenr(cow, cow->start);
254         btrfs_set_header_generation(cow, trans->transid);
255         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
256         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
257                                      BTRFS_HEADER_FLAG_RELOC);
258         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
259                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
260         else
261                 btrfs_set_header_owner(cow, new_root_objectid);
262
263         write_extent_buffer_fsid(cow, fs_info->fsid);
264
265         WARN_ON(btrfs_header_generation(buf) > trans->transid);
266         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
267                 ret = btrfs_inc_ref(trans, root, cow, 1);
268         else
269                 ret = btrfs_inc_ref(trans, root, cow, 0);
270
271         if (ret)
272                 return ret;
273
274         btrfs_mark_buffer_dirty(cow);
275         *cow_ret = cow;
276         return 0;
277 }
278
/*
 * Kinds of operations that are recorded as elements of the tree mod log.
 */
enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        /* logged for the destination slots in tree_mod_log_eb_copy() */
        MOD_LOG_KEY_ADD,
        /* logged for the source slots in tree_mod_log_eb_copy() */
        MOD_LOG_KEY_REMOVE,
        /* logged per slot when a whole node is dropped from the tree */
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        /* logged for slots that get overwritten by a move inside a node */
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        /* one element describing a move of several keys inside a node */
        MOD_LOG_MOVE_KEYS,
        /* the root node of a tree was replaced by another block */
        MOD_LOG_ROOT_REPLACE,
};
288
289 struct tree_mod_root {
290         u64 logical;
291         u8 level;
292 };
293
294 struct tree_mod_elem {
295         struct rb_node node;
296         u64 logical;
297         u64 seq;
298         enum mod_log_op op;
299
300         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
301         int slot;
302
303         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
304         u64 generation;
305
306         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
307         struct btrfs_disk_key key;
308         u64 blockptr;
309
310         /* this is used for op == MOD_LOG_MOVE_KEYS */
311         struct {
312                 int dst_slot;
313                 int nr_items;
314         } move;
315
316         /* this is used for op == MOD_LOG_ROOT_REPLACE */
317         struct tree_mod_root old_root;
318 };
319
320 /*
321  * Pull a new tree mod seq number for our operation.
322  */
323 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
324 {
325         return atomic64_inc_return(&fs_info->tree_mod_seq);
326 }
327
328 /*
329  * This adds a new blocker to the tree mod log's blocker list if the @elem
330  * passed does not already have a sequence number set. So when a caller expects
331  * to record tree modifications, it should ensure to set elem->seq to zero
332  * before calling btrfs_get_tree_mod_seq.
333  * Returns a fresh, unused tree log modification sequence number, even if no new
334  * blocker was added.
335  */
336 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
337                            struct seq_list *elem)
338 {
339         write_lock(&fs_info->tree_mod_log_lock);
340         spin_lock(&fs_info->tree_mod_seq_lock);
341         if (!elem->seq) {
342                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
343                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
344         }
345         spin_unlock(&fs_info->tree_mod_seq_lock);
346         write_unlock(&fs_info->tree_mod_log_lock);
347
348         return elem->seq;
349 }
350
351 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
352                             struct seq_list *elem)
353 {
354         struct rb_root *tm_root;
355         struct rb_node *node;
356         struct rb_node *next;
357         struct seq_list *cur_elem;
358         struct tree_mod_elem *tm;
359         u64 min_seq = (u64)-1;
360         u64 seq_putting = elem->seq;
361
362         if (!seq_putting)
363                 return;
364
365         spin_lock(&fs_info->tree_mod_seq_lock);
366         list_del(&elem->list);
367         elem->seq = 0;
368
369         list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
370                 if (cur_elem->seq < min_seq) {
371                         if (seq_putting > cur_elem->seq) {
372                                 /*
373                                  * blocker with lower sequence number exists, we
374                                  * cannot remove anything from the log
375                                  */
376                                 spin_unlock(&fs_info->tree_mod_seq_lock);
377                                 return;
378                         }
379                         min_seq = cur_elem->seq;
380                 }
381         }
382         spin_unlock(&fs_info->tree_mod_seq_lock);
383
384         /*
385          * anything that's lower than the lowest existing (read: blocked)
386          * sequence number can be removed from the tree.
387          */
388         write_lock(&fs_info->tree_mod_log_lock);
389         tm_root = &fs_info->tree_mod_log;
390         for (node = rb_first(tm_root); node; node = next) {
391                 next = rb_next(node);
392                 tm = rb_entry(node, struct tree_mod_elem, node);
393                 if (tm->seq > min_seq)
394                         continue;
395                 rb_erase(node, tm_root);
396                 kfree(tm);
397         }
398         write_unlock(&fs_info->tree_mod_log_lock);
399 }
400
401 /*
402  * key order of the log:
403  *       node/leaf start address -> sequence
404  *
405  * The 'start address' is the logical address of the *new* root node
406  * for root replace operations, or the logical address of the affected
407  * block for all other operations.
408  *
409  * Note: must be called with write lock for fs_info::tree_mod_log_lock.
410  */
411 static noinline int
412 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
413 {
414         struct rb_root *tm_root;
415         struct rb_node **new;
416         struct rb_node *parent = NULL;
417         struct tree_mod_elem *cur;
418
419         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
420
421         tm_root = &fs_info->tree_mod_log;
422         new = &tm_root->rb_node;
423         while (*new) {
424                 cur = rb_entry(*new, struct tree_mod_elem, node);
425                 parent = *new;
426                 if (cur->logical < tm->logical)
427                         new = &((*new)->rb_left);
428                 else if (cur->logical > tm->logical)
429                         new = &((*new)->rb_right);
430                 else if (cur->seq < tm->seq)
431                         new = &((*new)->rb_left);
432                 else if (cur->seq > tm->seq)
433                         new = &((*new)->rb_right);
434                 else
435                         return -EEXIST;
436         }
437
438         rb_link_node(&tm->node, parent, new);
439         rb_insert_color(&tm->node, tm_root);
440         return 0;
441 }
442
443 /*
444  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
445  * returns zero with the tree_mod_log_lock acquired. The caller must hold
446  * this until all tree mod log insertions are recorded in the rb tree and then
447  * write unlock fs_info::tree_mod_log_lock.
448  */
449 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
450                                     struct extent_buffer *eb) {
451         smp_mb();
452         if (list_empty(&(fs_info)->tree_mod_seq_list))
453                 return 1;
454         if (eb && btrfs_header_level(eb) == 0)
455                 return 1;
456
457         write_lock(&fs_info->tree_mod_log_lock);
458         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
459                 write_unlock(&fs_info->tree_mod_log_lock);
460                 return 1;
461         }
462
463         return 0;
464 }
465
466 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
467 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
468                                     struct extent_buffer *eb)
469 {
470         smp_mb();
471         if (list_empty(&(fs_info)->tree_mod_seq_list))
472                 return 0;
473         if (eb && btrfs_header_level(eb) == 0)
474                 return 0;
475
476         return 1;
477 }
478
479 static struct tree_mod_elem *
480 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
481                     enum mod_log_op op, gfp_t flags)
482 {
483         struct tree_mod_elem *tm;
484
485         tm = kzalloc(sizeof(*tm), flags);
486         if (!tm)
487                 return NULL;
488
489         tm->logical = eb->start;
490         if (op != MOD_LOG_KEY_ADD) {
491                 btrfs_node_key(eb, &tm->key, slot);
492                 tm->blockptr = btrfs_node_blockptr(eb, slot);
493         }
494         tm->op = op;
495         tm->slot = slot;
496         tm->generation = btrfs_node_ptr_generation(eb, slot);
497         RB_CLEAR_NODE(&tm->node);
498
499         return tm;
500 }
501
502 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
503                 enum mod_log_op op, gfp_t flags)
504 {
505         struct tree_mod_elem *tm;
506         int ret;
507
508         if (!tree_mod_need_log(eb->fs_info, eb))
509                 return 0;
510
511         tm = alloc_tree_mod_elem(eb, slot, op, flags);
512         if (!tm)
513                 return -ENOMEM;
514
515         if (tree_mod_dont_log(eb->fs_info, eb)) {
516                 kfree(tm);
517                 return 0;
518         }
519
520         ret = __tree_mod_log_insert(eb->fs_info, tm);
521         write_unlock(&eb->fs_info->tree_mod_log_lock);
522         if (ret)
523                 kfree(tm);
524
525         return ret;
526 }
527
528 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
529                 int dst_slot, int src_slot, int nr_items)
530 {
531         struct tree_mod_elem *tm = NULL;
532         struct tree_mod_elem **tm_list = NULL;
533         int ret = 0;
534         int i;
535         int locked = 0;
536
537         if (!tree_mod_need_log(eb->fs_info, eb))
538                 return 0;
539
540         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
541         if (!tm_list)
542                 return -ENOMEM;
543
544         tm = kzalloc(sizeof(*tm), GFP_NOFS);
545         if (!tm) {
546                 ret = -ENOMEM;
547                 goto free_tms;
548         }
549
550         tm->logical = eb->start;
551         tm->slot = src_slot;
552         tm->move.dst_slot = dst_slot;
553         tm->move.nr_items = nr_items;
554         tm->op = MOD_LOG_MOVE_KEYS;
555
556         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
557                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
558                     MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
559                 if (!tm_list[i]) {
560                         ret = -ENOMEM;
561                         goto free_tms;
562                 }
563         }
564
565         if (tree_mod_dont_log(eb->fs_info, eb))
566                 goto free_tms;
567         locked = 1;
568
569         /*
570          * When we override something during the move, we log these removals.
571          * This can only happen when we move towards the beginning of the
572          * buffer, i.e. dst_slot < src_slot.
573          */
574         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
575                 ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
576                 if (ret)
577                         goto free_tms;
578         }
579
580         ret = __tree_mod_log_insert(eb->fs_info, tm);
581         if (ret)
582                 goto free_tms;
583         write_unlock(&eb->fs_info->tree_mod_log_lock);
584         kfree(tm_list);
585
586         return 0;
587 free_tms:
588         for (i = 0; i < nr_items; i++) {
589                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
590                         rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
591                 kfree(tm_list[i]);
592         }
593         if (locked)
594                 write_unlock(&eb->fs_info->tree_mod_log_lock);
595         kfree(tm_list);
596         kfree(tm);
597
598         return ret;
599 }
600
601 static inline int
602 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
603                        struct tree_mod_elem **tm_list,
604                        int nritems)
605 {
606         int i, j;
607         int ret;
608
609         for (i = nritems - 1; i >= 0; i--) {
610                 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
611                 if (ret) {
612                         for (j = nritems - 1; j > i; j--)
613                                 rb_erase(&tm_list[j]->node,
614                                          &fs_info->tree_mod_log);
615                         return ret;
616                 }
617         }
618
619         return 0;
620 }
621
622 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
623                          struct extent_buffer *new_root, int log_removal)
624 {
625         struct btrfs_fs_info *fs_info = old_root->fs_info;
626         struct tree_mod_elem *tm = NULL;
627         struct tree_mod_elem **tm_list = NULL;
628         int nritems = 0;
629         int ret = 0;
630         int i;
631
632         if (!tree_mod_need_log(fs_info, NULL))
633                 return 0;
634
635         if (log_removal && btrfs_header_level(old_root) > 0) {
636                 nritems = btrfs_header_nritems(old_root);
637                 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
638                                   GFP_NOFS);
639                 if (!tm_list) {
640                         ret = -ENOMEM;
641                         goto free_tms;
642                 }
643                 for (i = 0; i < nritems; i++) {
644                         tm_list[i] = alloc_tree_mod_elem(old_root, i,
645                             MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
646                         if (!tm_list[i]) {
647                                 ret = -ENOMEM;
648                                 goto free_tms;
649                         }
650                 }
651         }
652
653         tm = kzalloc(sizeof(*tm), GFP_NOFS);
654         if (!tm) {
655                 ret = -ENOMEM;
656                 goto free_tms;
657         }
658
659         tm->logical = new_root->start;
660         tm->old_root.logical = old_root->start;
661         tm->old_root.level = btrfs_header_level(old_root);
662         tm->generation = btrfs_header_generation(old_root);
663         tm->op = MOD_LOG_ROOT_REPLACE;
664
665         if (tree_mod_dont_log(fs_info, NULL))
666                 goto free_tms;
667
668         if (tm_list)
669                 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
670         if (!ret)
671                 ret = __tree_mod_log_insert(fs_info, tm);
672
673         write_unlock(&fs_info->tree_mod_log_lock);
674         if (ret)
675                 goto free_tms;
676         kfree(tm_list);
677
678         return ret;
679
680 free_tms:
681         if (tm_list) {
682                 for (i = 0; i < nritems; i++)
683                         kfree(tm_list[i]);
684                 kfree(tm_list);
685         }
686         kfree(tm);
687
688         return ret;
689 }
690
691 static struct tree_mod_elem *
692 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
693                       int smallest)
694 {
695         struct rb_root *tm_root;
696         struct rb_node *node;
697         struct tree_mod_elem *cur = NULL;
698         struct tree_mod_elem *found = NULL;
699
700         read_lock(&fs_info->tree_mod_log_lock);
701         tm_root = &fs_info->tree_mod_log;
702         node = tm_root->rb_node;
703         while (node) {
704                 cur = rb_entry(node, struct tree_mod_elem, node);
705                 if (cur->logical < start) {
706                         node = node->rb_left;
707                 } else if (cur->logical > start) {
708                         node = node->rb_right;
709                 } else if (cur->seq < min_seq) {
710                         node = node->rb_left;
711                 } else if (!smallest) {
712                         /* we want the node with the highest seq */
713                         if (found)
714                                 BUG_ON(found->seq > cur->seq);
715                         found = cur;
716                         node = node->rb_left;
717                 } else if (cur->seq > min_seq) {
718                         /* we want the node with the smallest seq */
719                         if (found)
720                                 BUG_ON(found->seq < cur->seq);
721                         found = cur;
722                         node = node->rb_right;
723                 } else {
724                         found = cur;
725                         break;
726                 }
727         }
728         read_unlock(&fs_info->tree_mod_log_lock);
729
730         return found;
731 }
732
733 /*
734  * this returns the element from the log with the smallest time sequence
735  * value that's in the log (the oldest log item). any element with a time
736  * sequence lower than min_seq will be ignored.
737  */
738 static struct tree_mod_elem *
739 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
740                            u64 min_seq)
741 {
742         return __tree_mod_log_search(fs_info, start, min_seq, 1);
743 }
744
745 /*
746  * this returns the element from the log with the largest time sequence
747  * value that's in the log (the most recent log item). any element with
748  * a time sequence lower than min_seq will be ignored.
749  */
750 static struct tree_mod_elem *
751 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
752 {
753         return __tree_mod_log_search(fs_info, start, min_seq, 0);
754 }
755
756 static noinline int
757 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
758                      struct extent_buffer *src, unsigned long dst_offset,
759                      unsigned long src_offset, int nr_items)
760 {
761         int ret = 0;
762         struct tree_mod_elem **tm_list = NULL;
763         struct tree_mod_elem **tm_list_add, **tm_list_rem;
764         int i;
765         int locked = 0;
766
767         if (!tree_mod_need_log(fs_info, NULL))
768                 return 0;
769
770         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
771                 return 0;
772
773         tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
774                           GFP_NOFS);
775         if (!tm_list)
776                 return -ENOMEM;
777
778         tm_list_add = tm_list;
779         tm_list_rem = tm_list + nr_items;
780         for (i = 0; i < nr_items; i++) {
781                 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
782                     MOD_LOG_KEY_REMOVE, GFP_NOFS);
783                 if (!tm_list_rem[i]) {
784                         ret = -ENOMEM;
785                         goto free_tms;
786                 }
787
788                 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
789                     MOD_LOG_KEY_ADD, GFP_NOFS);
790                 if (!tm_list_add[i]) {
791                         ret = -ENOMEM;
792                         goto free_tms;
793                 }
794         }
795
796         if (tree_mod_dont_log(fs_info, NULL))
797                 goto free_tms;
798         locked = 1;
799
800         for (i = 0; i < nr_items; i++) {
801                 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
802                 if (ret)
803                         goto free_tms;
804                 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
805                 if (ret)
806                         goto free_tms;
807         }
808
809         write_unlock(&fs_info->tree_mod_log_lock);
810         kfree(tm_list);
811
812         return 0;
813
814 free_tms:
815         for (i = 0; i < nr_items * 2; i++) {
816                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
817                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
818                 kfree(tm_list[i]);
819         }
820         if (locked)
821                 write_unlock(&fs_info->tree_mod_log_lock);
822         kfree(tm_list);
823
824         return ret;
825 }
826
827 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
828 {
829         struct tree_mod_elem **tm_list = NULL;
830         int nritems = 0;
831         int i;
832         int ret = 0;
833
834         if (btrfs_header_level(eb) == 0)
835                 return 0;
836
837         if (!tree_mod_need_log(eb->fs_info, NULL))
838                 return 0;
839
840         nritems = btrfs_header_nritems(eb);
841         tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
842         if (!tm_list)
843                 return -ENOMEM;
844
845         for (i = 0; i < nritems; i++) {
846                 tm_list[i] = alloc_tree_mod_elem(eb, i,
847                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
848                 if (!tm_list[i]) {
849                         ret = -ENOMEM;
850                         goto free_tms;
851                 }
852         }
853
854         if (tree_mod_dont_log(eb->fs_info, eb))
855                 goto free_tms;
856
857         ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
858         write_unlock(&eb->fs_info->tree_mod_log_lock);
859         if (ret)
860                 goto free_tms;
861         kfree(tm_list);
862
863         return 0;
864
865 free_tms:
866         for (i = 0; i < nritems; i++)
867                 kfree(tm_list[i]);
868         kfree(tm_list);
869
870         return ret;
871 }
872
873 /*
874  * check if the tree block can be shared by multiple trees
875  */
876 int btrfs_block_can_be_shared(struct btrfs_root *root,
877                               struct extent_buffer *buf)
878 {
879         /*
880          * Tree blocks not in reference counted trees and tree roots
881          * are never shared. If a block was allocated after the last
882          * snapshot and the block was not allocated by tree relocation,
883          * we know the block is not shared.
884          */
885         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
886             buf != root->node && buf != root->commit_root &&
887             (btrfs_header_generation(buf) <=
888              btrfs_root_last_snapshot(&root->root_item) ||
889              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
890                 return 1;
891 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
892         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
893             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
894                 return 1;
895 #endif
896         return 0;
897 }
898
899 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
900                                        struct btrfs_root *root,
901                                        struct extent_buffer *buf,
902                                        struct extent_buffer *cow,
903                                        int *last_ref)
904 {
905         struct btrfs_fs_info *fs_info = root->fs_info;
906         u64 refs;
907         u64 owner;
908         u64 flags;
909         u64 new_flags = 0;
910         int ret;
911
912         /*
913          * Backrefs update rules:
914          *
915          * Always use full backrefs for extent pointers in tree block
916          * allocated by tree relocation.
917          *
918          * If a shared tree block is no longer referenced by its owner
919          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
920          * use full backrefs for extent pointers in tree block.
921          *
922          * If a tree block is been relocating
923          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
924          * use full backrefs for extent pointers in tree block.
925          * The reason for this is some operations (such as drop tree)
926          * are only allowed for blocks use full backrefs.
927          */
928
929         if (btrfs_block_can_be_shared(root, buf)) {
930                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
931                                                btrfs_header_level(buf), 1,
932                                                &refs, &flags);
933                 if (ret)
934                         return ret;
935                 if (refs == 0) {
936                         ret = -EROFS;
937                         btrfs_handle_fs_error(fs_info, ret, NULL);
938                         return ret;
939                 }
940         } else {
941                 refs = 1;
942                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
943                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
944                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
945                 else
946                         flags = 0;
947         }
948
949         owner = btrfs_header_owner(buf);
950         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
951                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
952
953         if (refs > 1) {
954                 if ((owner == root->root_key.objectid ||
955                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
956                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
957                         ret = btrfs_inc_ref(trans, root, buf, 1);
958                         if (ret)
959                                 return ret;
960
961                         if (root->root_key.objectid ==
962                             BTRFS_TREE_RELOC_OBJECTID) {
963                                 ret = btrfs_dec_ref(trans, root, buf, 0);
964                                 if (ret)
965                                         return ret;
966                                 ret = btrfs_inc_ref(trans, root, cow, 1);
967                                 if (ret)
968                                         return ret;
969                         }
970                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
971                 } else {
972
973                         if (root->root_key.objectid ==
974                             BTRFS_TREE_RELOC_OBJECTID)
975                                 ret = btrfs_inc_ref(trans, root, cow, 1);
976                         else
977                                 ret = btrfs_inc_ref(trans, root, cow, 0);
978                         if (ret)
979                                 return ret;
980                 }
981                 if (new_flags != 0) {
982                         int level = btrfs_header_level(buf);
983
984                         ret = btrfs_set_disk_extent_flags(trans, fs_info,
985                                                           buf->start,
986                                                           buf->len,
987                                                           new_flags, level, 0);
988                         if (ret)
989                                 return ret;
990                 }
991         } else {
992                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
993                         if (root->root_key.objectid ==
994                             BTRFS_TREE_RELOC_OBJECTID)
995                                 ret = btrfs_inc_ref(trans, root, cow, 1);
996                         else
997                                 ret = btrfs_inc_ref(trans, root, cow, 0);
998                         if (ret)
999                                 return ret;
1000                         ret = btrfs_dec_ref(trans, root, buf, 1);
1001                         if (ret)
1002                                 return ret;
1003                 }
1004                 clean_tree_block(fs_info, buf);
1005                 *last_ref = 1;
1006         }
1007         return 0;
1008 }
1009
1010 /*
1011  * does the dirty work in cow of a single block.  The parent block (if
1012  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1013  * dirty and returned locked.  If you modify the block it needs to be marked
1014  * dirty again.
1015  *
1016  * search_start -- an allocation hint for the new block
1017  *
1018  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1019  * bytes the allocator should try to find free next to the block it returns.
1020  * This is just a hint and may be ignored by the allocator.
1021  */
1022 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1023                              struct btrfs_root *root,
1024                              struct extent_buffer *buf,
1025                              struct extent_buffer *parent, int parent_slot,
1026                              struct extent_buffer **cow_ret,
1027                              u64 search_start, u64 empty_size)
1028 {
1029         struct btrfs_fs_info *fs_info = root->fs_info;
1030         struct btrfs_disk_key disk_key;
1031         struct extent_buffer *cow;
1032         int level, ret;
1033         int last_ref = 0;
1034         int unlock_orig = 0;
1035         u64 parent_start = 0;
1036
1037         if (*cow_ret == buf)
1038                 unlock_orig = 1;
1039
1040         btrfs_assert_tree_locked(buf);
1041
1042         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1043                 trans->transid != fs_info->running_transaction->transid);
1044         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1045                 trans->transid != root->last_trans);
1046
1047         level = btrfs_header_level(buf);
1048
1049         if (level == 0)
1050                 btrfs_item_key(buf, &disk_key, 0);
1051         else
1052                 btrfs_node_key(buf, &disk_key, 0);
1053
1054         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1055                 parent_start = parent->start;
1056
1057         cow = btrfs_alloc_tree_block(trans, root, parent_start,
1058                         root->root_key.objectid, &disk_key, level,
1059                         search_start, empty_size);
1060         if (IS_ERR(cow))
1061                 return PTR_ERR(cow);
1062
1063         /* cow is set to blocking by btrfs_init_new_buffer */
1064
1065         copy_extent_buffer_full(cow, buf);
1066         btrfs_set_header_bytenr(cow, cow->start);
1067         btrfs_set_header_generation(cow, trans->transid);
1068         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1069         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1070                                      BTRFS_HEADER_FLAG_RELOC);
1071         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1072                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1073         else
1074                 btrfs_set_header_owner(cow, root->root_key.objectid);
1075
1076         write_extent_buffer_fsid(cow, fs_info->fsid);
1077
1078         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1079         if (ret) {
1080                 btrfs_abort_transaction(trans, ret);
1081                 return ret;
1082         }
1083
1084         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1085                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1086                 if (ret) {
1087                         btrfs_abort_transaction(trans, ret);
1088                         return ret;
1089                 }
1090         }
1091
1092         if (buf == root->node) {
1093                 WARN_ON(parent && parent != buf);
1094                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1095                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1096                         parent_start = buf->start;
1097
1098                 extent_buffer_get(cow);
1099                 ret = tree_mod_log_insert_root(root->node, cow, 1);
1100                 BUG_ON(ret < 0);
1101                 rcu_assign_pointer(root->node, cow);
1102
1103                 btrfs_free_tree_block(trans, root, buf, parent_start,
1104                                       last_ref);
1105                 free_extent_buffer(buf);
1106                 add_root_to_dirty_list(root);
1107         } else {
1108                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1109                 tree_mod_log_insert_key(parent, parent_slot,
1110                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1111                 btrfs_set_node_blockptr(parent, parent_slot,
1112                                         cow->start);
1113                 btrfs_set_node_ptr_generation(parent, parent_slot,
1114                                               trans->transid);
1115                 btrfs_mark_buffer_dirty(parent);
1116                 if (last_ref) {
1117                         ret = tree_mod_log_free_eb(buf);
1118                         if (ret) {
1119                                 btrfs_abort_transaction(trans, ret);
1120                                 return ret;
1121                         }
1122                 }
1123                 btrfs_free_tree_block(trans, root, buf, parent_start,
1124                                       last_ref);
1125         }
1126         if (unlock_orig)
1127                 btrfs_tree_unlock(buf);
1128         free_extent_buffer_stale(buf);
1129         btrfs_mark_buffer_dirty(cow);
1130         *cow_ret = cow;
1131         return 0;
1132 }
1133
1134 /*
1135  * returns the logical address of the oldest predecessor of the given root.
1136  * entries older than time_seq are ignored.
1137  */
1138 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1139                 struct extent_buffer *eb_root, u64 time_seq)
1140 {
1141         struct tree_mod_elem *tm;
1142         struct tree_mod_elem *found = NULL;
1143         u64 root_logical = eb_root->start;
1144         int looped = 0;
1145
1146         if (!time_seq)
1147                 return NULL;
1148
1149         /*
1150          * the very last operation that's logged for a root is the
1151          * replacement operation (if it is replaced at all). this has
1152          * the logical address of the *new* root, making it the very
1153          * first operation that's logged for this root.
1154          */
1155         while (1) {
1156                 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1157                                                 time_seq);
1158                 if (!looped && !tm)
1159                         return NULL;
1160                 /*
1161                  * if there are no tree operation for the oldest root, we simply
1162                  * return it. this should only happen if that (old) root is at
1163                  * level 0.
1164                  */
1165                 if (!tm)
1166                         break;
1167
1168                 /*
1169                  * if there's an operation that's not a root replacement, we
1170                  * found the oldest version of our root. normally, we'll find a
1171                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1172                  */
1173                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1174                         break;
1175
1176                 found = tm;
1177                 root_logical = tm->old_root.logical;
1178                 looped = 1;
1179         }
1180
1181         /* if there's no old root to return, return what we found instead */
1182         if (!found)
1183                 found = tm;
1184
1185         return found;
1186 }
1187
1188 /*
1189  * tm is a pointer to the first operation to rewind within eb. then, all
1190  * previous operations will be rewound (until we reach something older than
1191  * time_seq).
1192  */
1193 static void
1194 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1195                       u64 time_seq, struct tree_mod_elem *first_tm)
1196 {
1197         u32 n;
1198         struct rb_node *next;
1199         struct tree_mod_elem *tm = first_tm;
1200         unsigned long o_dst;
1201         unsigned long o_src;
1202         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1203
1204         n = btrfs_header_nritems(eb);
1205         read_lock(&fs_info->tree_mod_log_lock);
1206         while (tm && tm->seq >= time_seq) {
1207                 /*
1208                  * all the operations are recorded with the operator used for
1209                  * the modification. as we're going backwards, we do the
1210                  * opposite of each operation here.
1211                  */
1212                 switch (tm->op) {
1213                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1214                         BUG_ON(tm->slot < n);
1215                         /* Fallthrough */
1216                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1217                 case MOD_LOG_KEY_REMOVE:
1218                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1219                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1220                         btrfs_set_node_ptr_generation(eb, tm->slot,
1221                                                       tm->generation);
1222                         n++;
1223                         break;
1224                 case MOD_LOG_KEY_REPLACE:
1225                         BUG_ON(tm->slot >= n);
1226                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1227                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1228                         btrfs_set_node_ptr_generation(eb, tm->slot,
1229                                                       tm->generation);
1230                         break;
1231                 case MOD_LOG_KEY_ADD:
1232                         /* if a move operation is needed it's in the log */
1233                         n--;
1234                         break;
1235                 case MOD_LOG_MOVE_KEYS:
1236                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1237                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1238                         memmove_extent_buffer(eb, o_dst, o_src,
1239                                               tm->move.nr_items * p_size);
1240                         break;
1241                 case MOD_LOG_ROOT_REPLACE:
1242                         /*
1243                          * this operation is special. for roots, this must be
1244                          * handled explicitly before rewinding.
1245                          * for non-roots, this operation may exist if the node
1246                          * was a root: root A -> child B; then A gets empty and
1247                          * B is promoted to the new root. in the mod log, we'll
1248                          * have a root-replace operation for B, a tree block
1249                          * that is no root. we simply ignore that operation.
1250                          */
1251                         break;
1252                 }
1253                 next = rb_next(&tm->node);
1254                 if (!next)
1255                         break;
1256                 tm = rb_entry(next, struct tree_mod_elem, node);
1257                 if (tm->logical != first_tm->logical)
1258                         break;
1259         }
1260         read_unlock(&fs_info->tree_mod_log_lock);
1261         btrfs_set_header_nritems(eb, n);
1262 }
1263
1264 /*
1265  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1266  * is returned. If rewind operations happen, a fresh buffer is returned. The
1267  * returned buffer is always read-locked. If the returned buffer is not the
1268  * input buffer, the lock on the input buffer is released and the input buffer
1269  * is freed (its refcount is decremented).
1270  */
1271 static struct extent_buffer *
1272 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1273                     struct extent_buffer *eb, u64 time_seq)
1274 {
1275         struct extent_buffer *eb_rewin;
1276         struct tree_mod_elem *tm;
1277
1278         if (!time_seq)
1279                 return eb;
1280
1281         if (btrfs_header_level(eb) == 0)
1282                 return eb;
1283
1284         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1285         if (!tm)
1286                 return eb;
1287
1288         btrfs_set_path_blocking(path);
1289         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1290
1291         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1292                 BUG_ON(tm->slot != 0);
1293                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1294                 if (!eb_rewin) {
1295                         btrfs_tree_read_unlock_blocking(eb);
1296                         free_extent_buffer(eb);
1297                         return NULL;
1298                 }
1299                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1300                 btrfs_set_header_backref_rev(eb_rewin,
1301                                              btrfs_header_backref_rev(eb));
1302                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1303                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1304         } else {
1305                 eb_rewin = btrfs_clone_extent_buffer(eb);
1306                 if (!eb_rewin) {
1307                         btrfs_tree_read_unlock_blocking(eb);
1308                         free_extent_buffer(eb);
1309                         return NULL;
1310                 }
1311         }
1312
1313         btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1314         btrfs_tree_read_unlock_blocking(eb);
1315         free_extent_buffer(eb);
1316
1317         extent_buffer_get(eb_rewin);
1318         btrfs_tree_read_lock(eb_rewin);
1319         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1320         WARN_ON(btrfs_header_nritems(eb_rewin) >
1321                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1322
1323         return eb_rewin;
1324 }
1325
1326 /*
1327  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1328  * value. If there are no changes, the current root->root_node is returned. If
1329  * anything changed in between, there's a fresh buffer allocated on which the
1330  * rewind operations are done. In any case, the returned buffer is read locked.
1331  * Returns NULL on error (with no locks held).
1332  */
1333 static inline struct extent_buffer *
1334 get_old_root(struct btrfs_root *root, u64 time_seq)
1335 {
1336         struct btrfs_fs_info *fs_info = root->fs_info;
1337         struct tree_mod_elem *tm;
1338         struct extent_buffer *eb = NULL;
1339         struct extent_buffer *eb_root;
1340         struct extent_buffer *old;
1341         struct tree_mod_root *old_root = NULL;
1342         u64 old_generation = 0;
1343         u64 logical;
1344         int level;
1345
1346         eb_root = btrfs_read_lock_root_node(root);
1347         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1348         if (!tm)
1349                 return eb_root;
1350
1351         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1352                 old_root = &tm->old_root;
1353                 old_generation = tm->generation;
1354                 logical = old_root->logical;
1355                 level = old_root->level;
1356         } else {
1357                 logical = eb_root->start;
1358                 level = btrfs_header_level(eb_root);
1359         }
1360
1361         tm = tree_mod_log_search(fs_info, logical, time_seq);
1362         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1363                 btrfs_tree_read_unlock(eb_root);
1364                 free_extent_buffer(eb_root);
1365                 old = read_tree_block(fs_info, logical, 0, level, NULL);
1366                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1367                         if (!IS_ERR(old))
1368                                 free_extent_buffer(old);
1369                         btrfs_warn(fs_info,
1370                                    "failed to read tree block %llu from get_old_root",
1371                                    logical);
1372                 } else {
1373                         eb = btrfs_clone_extent_buffer(old);
1374                         free_extent_buffer(old);
1375                 }
1376         } else if (old_root) {
1377                 btrfs_tree_read_unlock(eb_root);
1378                 free_extent_buffer(eb_root);
1379                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1380         } else {
1381                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1382                 eb = btrfs_clone_extent_buffer(eb_root);
1383                 btrfs_tree_read_unlock_blocking(eb_root);
1384                 free_extent_buffer(eb_root);
1385         }
1386
1387         if (!eb)
1388                 return NULL;
1389         extent_buffer_get(eb);
1390         btrfs_tree_read_lock(eb);
1391         if (old_root) {
1392                 btrfs_set_header_bytenr(eb, eb->start);
1393                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1394                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1395                 btrfs_set_header_level(eb, old_root->level);
1396                 btrfs_set_header_generation(eb, old_generation);
1397         }
1398         if (tm)
1399                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1400         else
1401                 WARN_ON(btrfs_header_level(eb) != 0);
1402         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1403
1404         return eb;
1405 }
1406
1407 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1408 {
1409         struct tree_mod_elem *tm;
1410         int level;
1411         struct extent_buffer *eb_root = btrfs_root_node(root);
1412
1413         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1414         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1415                 level = tm->old_root.level;
1416         } else {
1417                 level = btrfs_header_level(eb_root);
1418         }
1419         free_extent_buffer(eb_root);
1420
1421         return level;
1422 }
1423
1424 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1425                                    struct btrfs_root *root,
1426                                    struct extent_buffer *buf)
1427 {
1428         if (btrfs_is_testing(root->fs_info))
1429                 return 0;
1430
1431         /* Ensure we can see the FORCE_COW bit */
1432         smp_mb__before_atomic();
1433
1434         /*
1435          * We do not need to cow a block if
1436          * 1) this block is not created or changed in this transaction;
1437          * 2) this block does not belong to TREE_RELOC tree;
1438          * 3) the root is not forced COW.
1439          *
1440          * What is forced COW:
1441          *    when we create snapshot during committing the transaction,
1442          *    after we've finished coping src root, we must COW the shared
1443          *    block to ensure the metadata consistency.
1444          */
1445         if (btrfs_header_generation(buf) == trans->transid &&
1446             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1447             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1448               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1449             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1450                 return 0;
1451         return 1;
1452 }
1453
1454 /*
1455  * cows a single block, see __btrfs_cow_block for the real work.
1456  * This version of it has extra checks so that a block isn't COWed more than
1457  * once per transaction, as long as it hasn't been written yet
1458  */
1459 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1460                     struct btrfs_root *root, struct extent_buffer *buf,
1461                     struct extent_buffer *parent, int parent_slot,
1462                     struct extent_buffer **cow_ret)
1463 {
1464         struct btrfs_fs_info *fs_info = root->fs_info;
1465         u64 search_start;
1466         int ret;
1467
1468         if (trans->transaction != fs_info->running_transaction)
1469                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1470                        trans->transid,
1471                        fs_info->running_transaction->transid);
1472
1473         if (trans->transid != fs_info->generation)
1474                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1475                        trans->transid, fs_info->generation);
1476
1477         if (!should_cow_block(trans, root, buf)) {
1478                 trans->dirty = true;
1479                 *cow_ret = buf;
1480                 return 0;
1481         }
1482
1483         search_start = buf->start & ~((u64)SZ_1G - 1);
1484
1485         if (parent)
1486                 btrfs_set_lock_blocking(parent);
1487         btrfs_set_lock_blocking(buf);
1488
1489         ret = __btrfs_cow_block(trans, root, buf, parent,
1490                                  parent_slot, cow_ret, search_start, 0);
1491
1492         trace_btrfs_cow_block(root, buf, *cow_ret);
1493
1494         return ret;
1495 }
1496
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by: returns 1 when the gap between the end of the
 * lower block and the start of the higher one is below 32768 bytes, and 0
 * otherwise (including when both addresses are equal).
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	u64 gap;

	if (blocknr == other)
		return 0;

	if (blocknr < other)
		gap = other - (blocknr + blocksize);
	else
		gap = blocknr - (other + blocksize);

	return gap < 32768;
}
1509
1510 /*
1511  * compare two keys in a memcmp fashion
1512  */
1513 static int comp_keys(const struct btrfs_disk_key *disk,
1514                      const struct btrfs_key *k2)
1515 {
1516         struct btrfs_key k1;
1517
1518         btrfs_disk_key_to_cpu(&k1, disk);
1519
1520         return btrfs_comp_cpu_keys(&k1, k2);
1521 }
1522
1523 /*
1524  * same as comp_keys only with two btrfs_key's
1525  */
1526 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1527 {
1528         if (k1->objectid > k2->objectid)
1529                 return 1;
1530         if (k1->objectid < k2->objectid)
1531                 return -1;
1532         if (k1->type > k2->type)
1533                 return 1;
1534         if (k1->type < k2->type)
1535                 return -1;
1536         if (k1->offset > k2->offset)
1537                 return 1;
1538         if (k1->offset < k2->offset)
1539                 return -1;
1540         return 0;
1541 }
1542
1543 /*
1544  * this is used by the defrag code to go through all the
1545  * leaves pointed to by a node and reallocate them so that
1546  * disk order is close to key order
1547  */
1548 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1549                        struct btrfs_root *root, struct extent_buffer *parent,
1550                        int start_slot, u64 *last_ret,
1551                        struct btrfs_key *progress)
1552 {
1553         struct btrfs_fs_info *fs_info = root->fs_info;
1554         struct extent_buffer *cur;
1555         u64 blocknr;
1556         u64 gen;
1557         u64 search_start = *last_ret;
1558         u64 last_block = 0;
1559         u64 other;
1560         u32 parent_nritems;
1561         int end_slot;
1562         int i;
1563         int err = 0;
1564         int parent_level;
1565         int uptodate;
1566         u32 blocksize;
1567         int progress_passed = 0;
1568         struct btrfs_disk_key disk_key;
1569
1570         parent_level = btrfs_header_level(parent);
1571
1572         WARN_ON(trans->transaction != fs_info->running_transaction);
1573         WARN_ON(trans->transid != fs_info->generation);
1574
1575         parent_nritems = btrfs_header_nritems(parent);
1576         blocksize = fs_info->nodesize;
1577         end_slot = parent_nritems - 1;
1578
1579         if (parent_nritems <= 1)
1580                 return 0;
1581
1582         btrfs_set_lock_blocking(parent);
1583
1584         for (i = start_slot; i <= end_slot; i++) {
1585                 struct btrfs_key first_key;
1586                 int close = 1;
1587
1588                 btrfs_node_key(parent, &disk_key, i);
1589                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1590                         continue;
1591
1592                 progress_passed = 1;
1593                 blocknr = btrfs_node_blockptr(parent, i);
1594                 gen = btrfs_node_ptr_generation(parent, i);
1595                 btrfs_node_key_to_cpu(parent, &first_key, i);
1596                 if (last_block == 0)
1597                         last_block = blocknr;
1598
1599                 if (i > 0) {
1600                         other = btrfs_node_blockptr(parent, i - 1);
1601                         close = close_blocks(blocknr, other, blocksize);
1602                 }
1603                 if (!close && i < end_slot) {
1604                         other = btrfs_node_blockptr(parent, i + 1);
1605                         close = close_blocks(blocknr, other, blocksize);
1606                 }
1607                 if (close) {
1608                         last_block = blocknr;
1609                         continue;
1610                 }
1611
1612                 cur = find_extent_buffer(fs_info, blocknr);
1613                 if (cur)
1614                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1615                 else
1616                         uptodate = 0;
1617                 if (!cur || !uptodate) {
1618                         if (!cur) {
1619                                 cur = read_tree_block(fs_info, blocknr, gen,
1620                                                       parent_level - 1,
1621                                                       &first_key);
1622                                 if (IS_ERR(cur)) {
1623                                         return PTR_ERR(cur);
1624                                 } else if (!extent_buffer_uptodate(cur)) {
1625                                         free_extent_buffer(cur);
1626                                         return -EIO;
1627                                 }
1628                         } else if (!uptodate) {
1629                                 err = btrfs_read_buffer(cur, gen,
1630                                                 parent_level - 1,&first_key);
1631                                 if (err) {
1632                                         free_extent_buffer(cur);
1633                                         return err;
1634                                 }
1635                         }
1636                 }
1637                 if (search_start == 0)
1638                         search_start = last_block;
1639
1640                 btrfs_tree_lock(cur);
1641                 btrfs_set_lock_blocking(cur);
1642                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1643                                         &cur, search_start,
1644                                         min(16 * blocksize,
1645                                             (end_slot - i) * blocksize));
1646                 if (err) {
1647                         btrfs_tree_unlock(cur);
1648                         free_extent_buffer(cur);
1649                         break;
1650                 }
1651                 search_start = cur->start;
1652                 last_block = cur->start;
1653                 *last_ret = search_start;
1654                 btrfs_tree_unlock(cur);
1655                 free_extent_buffer(cur);
1656         }
1657         return err;
1658 }
1659
1660 /*
1661  * search for key in the extent_buffer.  The items start at offset p,
1662  * and they are item_size apart.  There are 'max' items in p.
1663  *
1664  * the slot in the array is returned via slot, and it points to
1665  * the place where you would insert key if it is not found in
1666  * the array.
1667  *
1668  * slot may point to max if the key is bigger than all of the keys
1669  */
1670 static noinline int generic_bin_search(struct extent_buffer *eb,
1671                                        unsigned long p, int item_size,
1672                                        const struct btrfs_key *key,
1673                                        int max, int *slot)
1674 {
1675         int low = 0;
1676         int high = max;
1677         int mid;
1678         int ret;
1679         struct btrfs_disk_key *tmp = NULL;
1680         struct btrfs_disk_key unaligned;
1681         unsigned long offset;
1682         char *kaddr = NULL;
1683         unsigned long map_start = 0;
1684         unsigned long map_len = 0;
1685         int err;
1686
1687         if (low > high) {
1688                 btrfs_err(eb->fs_info,
1689                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1690                           __func__, low, high, eb->start,
1691                           btrfs_header_owner(eb), btrfs_header_level(eb));
1692                 return -EINVAL;
1693         }
1694
1695         while (low < high) {
1696                 mid = (low + high) / 2;
1697                 offset = p + mid * item_size;
1698
1699                 if (!kaddr || offset < map_start ||
1700                     (offset + sizeof(struct btrfs_disk_key)) >
1701                     map_start + map_len) {
1702
1703                         err = map_private_extent_buffer(eb, offset,
1704                                                 sizeof(struct btrfs_disk_key),
1705                                                 &kaddr, &map_start, &map_len);
1706
1707                         if (!err) {
1708                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1709                                                         map_start);
1710                         } else if (err == 1) {
1711                                 read_extent_buffer(eb, &unaligned,
1712                                                    offset, sizeof(unaligned));
1713                                 tmp = &unaligned;
1714                         } else {
1715                                 return err;
1716                         }
1717
1718                 } else {
1719                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1720                                                         map_start);
1721                 }
1722                 ret = comp_keys(tmp, key);
1723
1724                 if (ret < 0)
1725                         low = mid + 1;
1726                 else if (ret > 0)
1727                         high = mid;
1728                 else {
1729                         *slot = mid;
1730                         return 0;
1731                 }
1732         }
1733         *slot = low;
1734         return 1;
1735 }
1736
1737 /*
1738  * simple bin_search frontend that does the right thing for
1739  * leaves vs nodes
1740  */
1741 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1742                      int level, int *slot)
1743 {
1744         if (level == 0)
1745                 return generic_bin_search(eb,
1746                                           offsetof(struct btrfs_leaf, items),
1747                                           sizeof(struct btrfs_item),
1748                                           key, btrfs_header_nritems(eb),
1749                                           slot);
1750         else
1751                 return generic_bin_search(eb,
1752                                           offsetof(struct btrfs_node, ptrs),
1753                                           sizeof(struct btrfs_key_ptr),
1754                                           key, btrfs_header_nritems(eb),
1755                                           slot);
1756 }
1757
1758 static void root_add_used(struct btrfs_root *root, u32 size)
1759 {
1760         spin_lock(&root->accounting_lock);
1761         btrfs_set_root_used(&root->root_item,
1762                             btrfs_root_used(&root->root_item) + size);
1763         spin_unlock(&root->accounting_lock);
1764 }
1765
1766 static void root_sub_used(struct btrfs_root *root, u32 size)
1767 {
1768         spin_lock(&root->accounting_lock);
1769         btrfs_set_root_used(&root->root_item,
1770                             btrfs_root_used(&root->root_item) - size);
1771         spin_unlock(&root->accounting_lock);
1772 }
1773
1774 /* given a node and slot number, this reads the blocks it points to.  The
1775  * extent buffer is returned with a reference taken (but unlocked).
1776  */
1777 static noinline struct extent_buffer *
1778 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1779                int slot)
1780 {
1781         int level = btrfs_header_level(parent);
1782         struct extent_buffer *eb;
1783         struct btrfs_key first_key;
1784
1785         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1786                 return ERR_PTR(-ENOENT);
1787
1788         BUG_ON(level == 0);
1789
1790         btrfs_node_key_to_cpu(parent, &first_key, slot);
1791         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1792                              btrfs_node_ptr_generation(parent, slot),
1793                              level - 1, &first_key);
1794         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1795                 free_extent_buffer(eb);
1796                 eb = ERR_PTR(-EIO);
1797         }
1798
1799         return eb;
1800 }
1801
1802 /*
1803  * node level balancing, used to make sure nodes are in proper order for
1804  * item deletion.  We balance from the top down, so we have to make sure
1805  * that a deletion won't leave an node completely empty later on.
1806  */
1807 static noinline int balance_level(struct btrfs_trans_handle *trans,
1808                          struct btrfs_root *root,
1809                          struct btrfs_path *path, int level)
1810 {
1811         struct btrfs_fs_info *fs_info = root->fs_info;
1812         struct extent_buffer *right = NULL;
1813         struct extent_buffer *mid;
1814         struct extent_buffer *left = NULL;
1815         struct extent_buffer *parent = NULL;
1816         int ret = 0;
1817         int wret;
1818         int pslot;
1819         int orig_slot = path->slots[level];
1820         u64 orig_ptr;
1821
1822         if (level == 0)
1823                 return 0;
1824
1825         mid = path->nodes[level];
1826
1827         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1828                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1829         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1830
1831         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1832
1833         if (level < BTRFS_MAX_LEVEL - 1) {
1834                 parent = path->nodes[level + 1];
1835                 pslot = path->slots[level + 1];
1836         }
1837
1838         /*
1839          * deal with the case where there is only one pointer in the root
1840          * by promoting the node below to a root
1841          */
1842         if (!parent) {
1843                 struct extent_buffer *child;
1844
1845                 if (btrfs_header_nritems(mid) != 1)
1846                         return 0;
1847
1848                 /* promote the child to a root */
1849                 child = read_node_slot(fs_info, mid, 0);
1850                 if (IS_ERR(child)) {
1851                         ret = PTR_ERR(child);
1852                         btrfs_handle_fs_error(fs_info, ret, NULL);
1853                         goto enospc;
1854                 }
1855
1856                 btrfs_tree_lock(child);
1857                 btrfs_set_lock_blocking(child);
1858                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1859                 if (ret) {
1860                         btrfs_tree_unlock(child);
1861                         free_extent_buffer(child);
1862                         goto enospc;
1863                 }
1864
1865                 ret = tree_mod_log_insert_root(root->node, child, 1);
1866                 BUG_ON(ret < 0);
1867                 rcu_assign_pointer(root->node, child);
1868
1869                 add_root_to_dirty_list(root);
1870                 btrfs_tree_unlock(child);
1871
1872                 path->locks[level] = 0;
1873                 path->nodes[level] = NULL;
1874                 clean_tree_block(fs_info, mid);
1875                 btrfs_tree_unlock(mid);
1876                 /* once for the path */
1877                 free_extent_buffer(mid);
1878
1879                 root_sub_used(root, mid->len);
1880                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1881                 /* once for the root ptr */
1882                 free_extent_buffer_stale(mid);
1883                 return 0;
1884         }
1885         if (btrfs_header_nritems(mid) >
1886             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1887                 return 0;
1888
1889         left = read_node_slot(fs_info, parent, pslot - 1);
1890         if (IS_ERR(left))
1891                 left = NULL;
1892
1893         if (left) {
1894                 btrfs_tree_lock(left);
1895                 btrfs_set_lock_blocking(left);
1896                 wret = btrfs_cow_block(trans, root, left,
1897                                        parent, pslot - 1, &left);
1898                 if (wret) {
1899                         ret = wret;
1900                         goto enospc;
1901                 }
1902         }
1903
1904         right = read_node_slot(fs_info, parent, pslot + 1);
1905         if (IS_ERR(right))
1906                 right = NULL;
1907
1908         if (right) {
1909                 btrfs_tree_lock(right);
1910                 btrfs_set_lock_blocking(right);
1911                 wret = btrfs_cow_block(trans, root, right,
1912                                        parent, pslot + 1, &right);
1913                 if (wret) {
1914                         ret = wret;
1915                         goto enospc;
1916                 }
1917         }
1918
1919         /* first, try to make some room in the middle buffer */
1920         if (left) {
1921                 orig_slot += btrfs_header_nritems(left);
1922                 wret = push_node_left(trans, fs_info, left, mid, 1);
1923                 if (wret < 0)
1924                         ret = wret;
1925         }
1926
1927         /*
1928          * then try to empty the right most buffer into the middle
1929          */
1930         if (right) {
1931                 wret = push_node_left(trans, fs_info, mid, right, 1);
1932                 if (wret < 0 && wret != -ENOSPC)
1933                         ret = wret;
1934                 if (btrfs_header_nritems(right) == 0) {
1935                         clean_tree_block(fs_info, right);
1936                         btrfs_tree_unlock(right);
1937                         del_ptr(root, path, level + 1, pslot + 1);
1938                         root_sub_used(root, right->len);
1939                         btrfs_free_tree_block(trans, root, right, 0, 1);
1940                         free_extent_buffer_stale(right);
1941                         right = NULL;
1942                 } else {
1943                         struct btrfs_disk_key right_key;
1944                         btrfs_node_key(right, &right_key, 0);
1945                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1946                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1947                         BUG_ON(ret < 0);
1948                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1949                         btrfs_mark_buffer_dirty(parent);
1950                 }
1951         }
1952         if (btrfs_header_nritems(mid) == 1) {
1953                 /*
1954                  * we're not allowed to leave a node with one item in the
1955                  * tree during a delete.  A deletion from lower in the tree
1956                  * could try to delete the only pointer in this node.
1957                  * So, pull some keys from the left.
1958                  * There has to be a left pointer at this point because
1959                  * otherwise we would have pulled some pointers from the
1960                  * right
1961                  */
1962                 if (!left) {
1963                         ret = -EROFS;
1964                         btrfs_handle_fs_error(fs_info, ret, NULL);
1965                         goto enospc;
1966                 }
1967                 wret = balance_node_right(trans, fs_info, mid, left);
1968                 if (wret < 0) {
1969                         ret = wret;
1970                         goto enospc;
1971                 }
1972                 if (wret == 1) {
1973                         wret = push_node_left(trans, fs_info, left, mid, 1);
1974                         if (wret < 0)
1975                                 ret = wret;
1976                 }
1977                 BUG_ON(wret == 1);
1978         }
1979         if (btrfs_header_nritems(mid) == 0) {
1980                 clean_tree_block(fs_info, mid);
1981                 btrfs_tree_unlock(mid);
1982                 del_ptr(root, path, level + 1, pslot);
1983                 root_sub_used(root, mid->len);
1984                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1985                 free_extent_buffer_stale(mid);
1986                 mid = NULL;
1987         } else {
1988                 /* update the parent key to reflect our changes */
1989                 struct btrfs_disk_key mid_key;
1990                 btrfs_node_key(mid, &mid_key, 0);
1991                 ret = tree_mod_log_insert_key(parent, pslot,
1992                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1993                 BUG_ON(ret < 0);
1994                 btrfs_set_node_key(parent, &mid_key, pslot);
1995                 btrfs_mark_buffer_dirty(parent);
1996         }
1997
1998         /* update the path */
1999         if (left) {
2000                 if (btrfs_header_nritems(left) > orig_slot) {
2001                         extent_buffer_get(left);
2002                         /* left was locked after cow */
2003                         path->nodes[level] = left;
2004                         path->slots[level + 1] -= 1;
2005                         path->slots[level] = orig_slot;
2006                         if (mid) {
2007                                 btrfs_tree_unlock(mid);
2008                                 free_extent_buffer(mid);
2009                         }
2010                 } else {
2011                         orig_slot -= btrfs_header_nritems(left);
2012                         path->slots[level] = orig_slot;
2013                 }
2014         }
2015         /* double check we haven't messed things up */
2016         if (orig_ptr !=
2017             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2018                 BUG();
2019 enospc:
2020         if (right) {
2021                 btrfs_tree_unlock(right);
2022                 free_extent_buffer(right);
2023         }
2024         if (left) {
2025                 if (path->nodes[level] != left)
2026                         btrfs_tree_unlock(left);
2027                 free_extent_buffer(left);
2028         }
2029         return ret;
2030 }
2031
2032 /* Node balancing for insertion.  Here we only split or push nodes around
2033  * when they are completely full.  This is also done top down, so we
2034  * have to be pessimistic.
2035  */
2036 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2037                                           struct btrfs_root *root,
2038                                           struct btrfs_path *path, int level)
2039 {
2040         struct btrfs_fs_info *fs_info = root->fs_info;
2041         struct extent_buffer *right = NULL;
2042         struct extent_buffer *mid;
2043         struct extent_buffer *left = NULL;
2044         struct extent_buffer *parent = NULL;
2045         int ret = 0;
2046         int wret;
2047         int pslot;
2048         int orig_slot = path->slots[level];
2049
2050         if (level == 0)
2051                 return 1;
2052
2053         mid = path->nodes[level];
2054         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2055
2056         if (level < BTRFS_MAX_LEVEL - 1) {
2057                 parent = path->nodes[level + 1];
2058                 pslot = path->slots[level + 1];
2059         }
2060
2061         if (!parent)
2062                 return 1;
2063
2064         left = read_node_slot(fs_info, parent, pslot - 1);
2065         if (IS_ERR(left))
2066                 left = NULL;
2067
2068         /* first, try to make some room in the middle buffer */
2069         if (left) {
2070                 u32 left_nr;
2071
2072                 btrfs_tree_lock(left);
2073                 btrfs_set_lock_blocking(left);
2074
2075                 left_nr = btrfs_header_nritems(left);
2076                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2077                         wret = 1;
2078                 } else {
2079                         ret = btrfs_cow_block(trans, root, left, parent,
2080                                               pslot - 1, &left);
2081                         if (ret)
2082                                 wret = 1;
2083                         else {
2084                                 wret = push_node_left(trans, fs_info,
2085                                                       left, mid, 0);
2086                         }
2087                 }
2088                 if (wret < 0)
2089                         ret = wret;
2090                 if (wret == 0) {
2091                         struct btrfs_disk_key disk_key;
2092                         orig_slot += left_nr;
2093                         btrfs_node_key(mid, &disk_key, 0);
2094                         ret = tree_mod_log_insert_key(parent, pslot,
2095                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2096                         BUG_ON(ret < 0);
2097                         btrfs_set_node_key(parent, &disk_key, pslot);
2098                         btrfs_mark_buffer_dirty(parent);
2099                         if (btrfs_header_nritems(left) > orig_slot) {
2100                                 path->nodes[level] = left;
2101                                 path->slots[level + 1] -= 1;
2102                                 path->slots[level] = orig_slot;
2103                                 btrfs_tree_unlock(mid);
2104                                 free_extent_buffer(mid);
2105                         } else {
2106                                 orig_slot -=
2107                                         btrfs_header_nritems(left);
2108                                 path->slots[level] = orig_slot;
2109                                 btrfs_tree_unlock(left);
2110                                 free_extent_buffer(left);
2111                         }
2112                         return 0;
2113                 }
2114                 btrfs_tree_unlock(left);
2115                 free_extent_buffer(left);
2116         }
2117         right = read_node_slot(fs_info, parent, pslot + 1);
2118         if (IS_ERR(right))
2119                 right = NULL;
2120
2121         /*
2122          * then try to empty the right most buffer into the middle
2123          */
2124         if (right) {
2125                 u32 right_nr;
2126
2127                 btrfs_tree_lock(right);
2128                 btrfs_set_lock_blocking(right);
2129
2130                 right_nr = btrfs_header_nritems(right);
2131                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2132                         wret = 1;
2133                 } else {
2134                         ret = btrfs_cow_block(trans, root, right,
2135                                               parent, pslot + 1,
2136                                               &right);
2137                         if (ret)
2138                                 wret = 1;
2139                         else {
2140                                 wret = balance_node_right(trans, fs_info,
2141                                                           right, mid);
2142                         }
2143                 }
2144                 if (wret < 0)
2145                         ret = wret;
2146                 if (wret == 0) {
2147                         struct btrfs_disk_key disk_key;
2148
2149                         btrfs_node_key(right, &disk_key, 0);
2150                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2151                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2152                         BUG_ON(ret < 0);
2153                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2154                         btrfs_mark_buffer_dirty(parent);
2155
2156                         if (btrfs_header_nritems(mid) <= orig_slot) {
2157                                 path->nodes[level] = right;
2158                                 path->slots[level + 1] += 1;
2159                                 path->slots[level] = orig_slot -
2160                                         btrfs_header_nritems(mid);
2161                                 btrfs_tree_unlock(mid);
2162                                 free_extent_buffer(mid);
2163                         } else {
2164                                 btrfs_tree_unlock(right);
2165                                 free_extent_buffer(right);
2166                         }
2167                         return 0;
2168                 }
2169                 btrfs_tree_unlock(right);
2170                 free_extent_buffer(right);
2171         }
2172         return 1;
2173 }
2174
2175 /*
2176  * readahead one full node of leaves, finding things that are close
2177  * to the block in 'slot', and triggering ra on them.
2178  */
2179 static void reada_for_search(struct btrfs_fs_info *fs_info,
2180                              struct btrfs_path *path,
2181                              int level, int slot, u64 objectid)
2182 {
2183         struct extent_buffer *node;
2184         struct btrfs_disk_key disk_key;
2185         u32 nritems;
2186         u64 search;
2187         u64 target;
2188         u64 nread = 0;
2189         struct extent_buffer *eb;
2190         u32 nr;
2191         u32 blocksize;
2192         u32 nscan = 0;
2193
2194         if (level != 1)
2195                 return;
2196
2197         if (!path->nodes[level])
2198                 return;
2199
2200         node = path->nodes[level];
2201
2202         search = btrfs_node_blockptr(node, slot);
2203         blocksize = fs_info->nodesize;
2204         eb = find_extent_buffer(fs_info, search);
2205         if (eb) {
2206                 free_extent_buffer(eb);
2207                 return;
2208         }
2209
2210         target = search;
2211
2212         nritems = btrfs_header_nritems(node);
2213         nr = slot;
2214
2215         while (1) {
2216                 if (path->reada == READA_BACK) {
2217                         if (nr == 0)
2218                                 break;
2219                         nr--;
2220                 } else if (path->reada == READA_FORWARD) {
2221                         nr++;
2222                         if (nr >= nritems)
2223                                 break;
2224                 }
2225                 if (path->reada == READA_BACK && objectid) {
2226                         btrfs_node_key(node, &disk_key, nr);
2227                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2228                                 break;
2229                 }
2230                 search = btrfs_node_blockptr(node, nr);
2231                 if ((search <= target && target - search <= 65536) ||
2232                     (search > target && search - target <= 65536)) {
2233                         readahead_tree_block(fs_info, search);
2234                         nread += blocksize;
2235                 }
2236                 nscan++;
2237                 if ((nread > 65536 || nscan > 32))
2238                         break;
2239         }
2240 }
2241
2242 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2243                                        struct btrfs_path *path, int level)
2244 {
2245         int slot;
2246         int nritems;
2247         struct extent_buffer *parent;
2248         struct extent_buffer *eb;
2249         u64 gen;
2250         u64 block1 = 0;
2251         u64 block2 = 0;
2252
2253         parent = path->nodes[level + 1];
2254         if (!parent)
2255                 return;
2256
2257         nritems = btrfs_header_nritems(parent);
2258         slot = path->slots[level + 1];
2259
2260         if (slot > 0) {
2261                 block1 = btrfs_node_blockptr(parent, slot - 1);
2262                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2263                 eb = find_extent_buffer(fs_info, block1);
2264                 /*
2265                  * if we get -eagain from btrfs_buffer_uptodate, we
2266                  * don't want to return eagain here.  That will loop
2267                  * forever
2268                  */
2269                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2270                         block1 = 0;
2271                 free_extent_buffer(eb);
2272         }
2273         if (slot + 1 < nritems) {
2274                 block2 = btrfs_node_blockptr(parent, slot + 1);
2275                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2276                 eb = find_extent_buffer(fs_info, block2);
2277                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2278                         block2 = 0;
2279                 free_extent_buffer(eb);
2280         }
2281
2282         if (block1)
2283                 readahead_tree_block(fs_info, block1);
2284         if (block2)
2285                 readahead_tree_block(fs_info, block2);
2286 }
2287
2288
2289 /*
2290  * when we walk down the tree, it is usually safe to unlock the higher layers
2291  * in the tree.  The exceptions are when our path goes through slot 0, because
2292  * operations on the tree might require changing key pointers higher up in the
2293  * tree.
2294  *
2295  * callers might also have set path->keep_locks, which tells this code to keep
2296  * the lock if the path points to the last slot in the block.  This is part of
2297  * walking through the tree, and selecting the next slot in the higher block.
2298  *
2299  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2300  * if lowest_unlock is 1, level 0 won't be unlocked
2301  */
2302 static noinline void unlock_up(struct btrfs_path *path, int level,
2303                                int lowest_unlock, int min_write_lock_level,
2304                                int *write_lock_level)
2305 {
2306         int i;
2307         int skip_level = level;
2308         int no_skips = 0;
2309         struct extent_buffer *t;
2310
2311         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2312                 if (!path->nodes[i])
2313                         break;
2314                 if (!path->locks[i])
2315                         break;
2316                 if (!no_skips && path->slots[i] == 0) {
2317                         skip_level = i + 1;
2318                         continue;
2319                 }
2320                 if (!no_skips && path->keep_locks) {
2321                         u32 nritems;
2322                         t = path->nodes[i];
2323                         nritems = btrfs_header_nritems(t);
2324                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2325                                 skip_level = i + 1;
2326                                 continue;
2327                         }
2328                 }
2329                 if (skip_level < i && i >= lowest_unlock)
2330                         no_skips = 1;
2331
2332                 t = path->nodes[i];
2333                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2334                         btrfs_tree_unlock_rw(t, path->locks[i]);
2335                         path->locks[i] = 0;
2336                         if (write_lock_level &&
2337                             i > min_write_lock_level &&
2338                             i <= *write_lock_level) {
2339                                 *write_lock_level = i - 1;
2340                         }
2341                 }
2342         }
2343 }
2344
2345 /*
2346  * This releases any locks held in the path starting at level and
2347  * going all the way up to the root.
2348  *
2349  * btrfs_search_slot will keep the lock held on higher nodes in a few
2350  * corner cases, such as COW of the block at slot zero in the node.  This
2351  * ignores those rules, and it should only be called when there are no
2352  * more updates to be done higher up in the tree.
2353  */
2354 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2355 {
2356         int i;
2357
2358         if (path->keep_locks)
2359                 return;
2360
2361         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2362                 if (!path->nodes[i])
2363                         continue;
2364                 if (!path->locks[i])
2365                         continue;
2366                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2367                 path->locks[i] = 0;
2368         }
2369 }
2370
2371 /*
2372  * helper function for btrfs_search_slot.  The goal is to find a block
2373  * in cache without setting the path to blocking.  If we find the block
2374  * we return zero and the path is unchanged.
2375  *
2376  * If we can't find the block, we set the path blocking and do some
2377  * reada.  -EAGAIN is returned and the search must be repeated.
2378  */
2379 static int
2380 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2381                       struct extent_buffer **eb_ret, int level, int slot,
2382                       const struct btrfs_key *key)
2383 {
2384         struct btrfs_fs_info *fs_info = root->fs_info;
2385         u64 blocknr;
2386         u64 gen;
2387         struct extent_buffer *b = *eb_ret;
2388         struct extent_buffer *tmp;
2389         struct btrfs_key first_key;
2390         int ret;
2391         int parent_level;
2392
2393         blocknr = btrfs_node_blockptr(b, slot);
2394         gen = btrfs_node_ptr_generation(b, slot);
2395         parent_level = btrfs_header_level(b);
2396         btrfs_node_key_to_cpu(b, &first_key, slot);
2397
2398         tmp = find_extent_buffer(fs_info, blocknr);
2399         if (tmp) {
2400                 /* first we do an atomic uptodate check */
2401                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2402                         *eb_ret = tmp;
2403                         return 0;
2404                 }
2405
2406                 /* the pages were up to date, but we failed
2407                  * the generation number check.  Do a full
2408                  * read for the generation number that is correct.
2409                  * We must do this without dropping locks so
2410                  * we can trust our generation number
2411                  */
2412                 btrfs_set_path_blocking(p);
2413
2414                 /* now we're allowed to do a blocking uptodate check */
2415                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2416                 if (!ret) {
2417                         *eb_ret = tmp;
2418                         return 0;
2419                 }
2420                 free_extent_buffer(tmp);
2421                 btrfs_release_path(p);
2422                 return -EIO;
2423         }
2424
2425         /*
2426          * reduce lock contention at high levels
2427          * of the btree by dropping locks before
2428          * we read.  Don't release the lock on the current
2429          * level because we need to walk this node to figure
2430          * out which blocks to read.
2431          */
2432         btrfs_unlock_up_safe(p, level + 1);
2433         btrfs_set_path_blocking(p);
2434
2435         free_extent_buffer(tmp);
2436         if (p->reada != READA_NONE)
2437                 reada_for_search(fs_info, p, level, slot, key->objectid);
2438
2439         btrfs_release_path(p);
2440
2441         ret = -EAGAIN;
2442         tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
2443                               &first_key);
2444         if (!IS_ERR(tmp)) {
2445                 /*
2446                  * If the read above didn't mark this buffer up to date,
2447                  * it will never end up being up to date.  Set ret to EIO now
2448                  * and give up so that our caller doesn't loop forever
2449                  * on our EAGAINs.
2450                  */
2451                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2452                         ret = -EIO;
2453                 free_extent_buffer(tmp);
2454         } else {
2455                 ret = PTR_ERR(tmp);
2456         }
2457         return ret;
2458 }
2459
2460 /*
2461  * helper function for btrfs_search_slot.  This does all of the checks
2462  * for node-level blocks and does any balancing required based on
2463  * the ins_len.
2464  *
2465  * If no extra work was required, zero is returned.  If we had to
2466  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2467  * start over
2468  */
2469 static int
2470 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2471                        struct btrfs_root *root, struct btrfs_path *p,
2472                        struct extent_buffer *b, int level, int ins_len,
2473                        int *write_lock_level)
2474 {
2475         struct btrfs_fs_info *fs_info = root->fs_info;
2476         int ret;
2477
2478         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2479             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2480                 int sret;
2481
2482                 if (*write_lock_level < level + 1) {
2483                         *write_lock_level = level + 1;
2484                         btrfs_release_path(p);
2485                         goto again;
2486                 }
2487
2488                 btrfs_set_path_blocking(p);
2489                 reada_for_balance(fs_info, p, level);
2490                 sret = split_node(trans, root, p, level);
2491                 btrfs_clear_path_blocking(p, NULL, 0);
2492
2493                 BUG_ON(sret > 0);
2494                 if (sret) {
2495                         ret = sret;
2496                         goto done;
2497                 }
2498                 b = p->nodes[level];
2499         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2500                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2501                 int sret;
2502
2503                 if (*write_lock_level < level + 1) {
2504                         *write_lock_level = level + 1;
2505                         btrfs_release_path(p);
2506                         goto again;
2507                 }
2508
2509                 btrfs_set_path_blocking(p);
2510                 reada_for_balance(fs_info, p, level);
2511                 sret = balance_level(trans, root, p, level);
2512                 btrfs_clear_path_blocking(p, NULL, 0);
2513
2514                 if (sret) {
2515                         ret = sret;
2516                         goto done;
2517                 }
2518                 b = p->nodes[level];
2519                 if (!b) {
2520                         btrfs_release_path(p);
2521                         goto again;
2522                 }
2523                 BUG_ON(btrfs_header_nritems(b) == 1);
2524         }
2525         return 0;
2526
2527 again:
2528         ret = -EAGAIN;
2529 done:
2530         return ret;
2531 }
2532
/*
 * Debug-only sanity check used by key_search(): assert that the first key
 * stored in @b equals @key.  Compiles to nothing unless CONFIG_BTRFS_ASSERT
 * is set.
 */
static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* Nodes and leaves keep their first key at different offsets. */
	if (level != 0) {
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_node, ptrs[0].key),
		    sizeof(disk_key)));
		return;
	}

	ASSERT(!memcmp_extent_buffer(b, &disk_key,
	    offsetof(struct btrfs_leaf, items[0].key),
	    sizeof(disk_key)));
#endif
}
2552
/*
 * Find @key in @b and store the resulting slot in *@slot.
 *
 * *@prev_cmp carries the comparison result from the level above.  When it is
 * 0 the binary search is skipped and slot 0 is used (the assumption that
 * @b's first key equals @key is only asserted in debug builds).  Otherwise a
 * real binary search is done and its result is both returned and remembered
 * in *@prev_cmp for the next level.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2566
2567 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2568                 u64 iobjectid, u64 ioff, u8 key_type,
2569                 struct btrfs_key *found_key)
2570 {
2571         int ret;
2572         struct btrfs_key key;
2573         struct extent_buffer *eb;
2574
2575         ASSERT(path);
2576         ASSERT(found_key);
2577
2578         key.type = key_type;
2579         key.objectid = iobjectid;
2580         key.offset = ioff;
2581
2582         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2583         if (ret < 0)
2584                 return ret;
2585
2586         eb = path->nodes[0];
2587         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2588                 ret = btrfs_next_leaf(fs_root, path);
2589                 if (ret)
2590                         return ret;
2591                 eb = path->nodes[0];
2592         }
2593
2594         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2595         if (found_key->type != key.type ||
2596                         found_key->objectid != key.objectid)
2597                 return 1;
2598
2599         return 0;
2600 }
2601
2602 /*
2603  * btrfs_search_slot - look for a key in a tree and perform necessary
2604  * modifications to preserve tree invariants.
2605  *
2606  * @trans:      Handle of transaction, used when modifying the tree
2607  * @p:          Holds all btree nodes along the search path
2608  * @root:       The root node of the tree
2609  * @key:        The key we are looking for
2610  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2611  *              deletions it's -1. 0 for plain searches
2612  * @cow:        boolean should CoW operations be performed. Must always be 1
2613  *              when modifying the tree.
2614  *
2615  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2616  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2617  *
2618  * If @key is found, 0 is returned and you can find the item in the leaf level
2619  * of the path (level 0)
2620  *
2621  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2622  * points to the slot where it should be inserted
2623  *
2624  * If an error is encountered while searching the tree a negative error number
2625  * is returned
2626  */
2627 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2628                       const struct btrfs_key *key, struct btrfs_path *p,
2629                       int ins_len, int cow)
2630 {
2631         struct btrfs_fs_info *fs_info = root->fs_info;
2632         struct extent_buffer *b;
2633         int slot;
2634         int ret;
2635         int err;
2636         int level;
2637         int lowest_unlock = 1;
2638         int root_lock;
2639         /* everything at write_lock_level or lower must be write locked */
2640         int write_lock_level = 0;
2641         u8 lowest_level = 0;
2642         int min_write_lock_level;
2643         int prev_cmp;
2644
2645         lowest_level = p->lowest_level;
2646         WARN_ON(lowest_level && ins_len > 0);
2647         WARN_ON(p->nodes[0] != NULL);
2648         BUG_ON(!cow && ins_len);
2649
2650         if (ins_len < 0) {
2651                 lowest_unlock = 2;
2652
2653                 /* when we are removing items, we might have to go up to level
2654                  * two as we update tree pointers  Make sure we keep write
2655                  * for those levels as well
2656                  */
2657                 write_lock_level = 2;
2658         } else if (ins_len > 0) {
2659                 /*
2660                  * for inserting items, make sure we have a write lock on
2661                  * level 1 so we can update keys
2662                  */
2663                 write_lock_level = 1;
2664         }
2665
2666         if (!cow)
2667                 write_lock_level = -1;
2668
2669         if (cow && (p->keep_locks || p->lowest_level))
2670                 write_lock_level = BTRFS_MAX_LEVEL;
2671
2672         min_write_lock_level = write_lock_level;
2673
2674 again:
2675         prev_cmp = -1;
2676         /*
2677          * we try very hard to do read locks on the root
2678          */
2679         root_lock = BTRFS_READ_LOCK;
2680         level = 0;
2681         if (p->search_commit_root) {
2682                 /*
2683                  * the commit roots are read only
2684                  * so we always do read locks
2685                  */
2686                 if (p->need_commit_sem)
2687                         down_read(&fs_info->commit_root_sem);
2688                 b = root->commit_root;
2689                 extent_buffer_get(b);
2690                 level = btrfs_header_level(b);
2691                 if (p->need_commit_sem)
2692                         up_read(&fs_info->commit_root_sem);
2693                 if (!p->skip_locking)
2694                         btrfs_tree_read_lock(b);
2695         } else {
2696                 if (p->skip_locking) {
2697                         b = btrfs_root_node(root);
2698                         level = btrfs_header_level(b);
2699                 } else {
2700                         /* we don't know the level of the root node
2701                          * until we actually have it read locked
2702                          */
2703                         b = btrfs_read_lock_root_node(root);
2704                         level = btrfs_header_level(b);
2705                         if (level <= write_lock_level) {
2706                                 /* whoops, must trade for write lock */
2707                                 btrfs_tree_read_unlock(b);
2708                                 free_extent_buffer(b);
2709                                 b = btrfs_lock_root_node(root);
2710                                 root_lock = BTRFS_WRITE_LOCK;
2711
2712                                 /* the level might have changed, check again */
2713                                 level = btrfs_header_level(b);
2714                         }
2715                 }
2716         }
2717         p->nodes[level] = b;
2718         if (!p->skip_locking)
2719                 p->locks[level] = root_lock;
2720
2721         while (b) {
2722                 level = btrfs_header_level(b);
2723
2724                 /*
2725                  * setup the path here so we can release it under lock
2726                  * contention with the cow code
2727                  */
2728                 if (cow) {
2729                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2730
2731                         /*
2732                          * if we don't really need to cow this block
2733                          * then we don't want to set the path blocking,
2734                          * so we test it here
2735                          */
2736                         if (!should_cow_block(trans, root, b)) {
2737                                 trans->dirty = true;
2738                                 goto cow_done;
2739                         }
2740
2741                         /*
2742                          * must have write locks on this node and the
2743                          * parent
2744                          */
2745                         if (level > write_lock_level ||
2746                             (level + 1 > write_lock_level &&
2747                             level + 1 < BTRFS_MAX_LEVEL &&
2748                             p->nodes[level + 1])) {
2749                                 write_lock_level = level + 1;
2750                                 btrfs_release_path(p);
2751                                 goto again;
2752                         }
2753
2754                         btrfs_set_path_blocking(p);
2755                         if (last_level)
2756                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2757                                                       &b);
2758                         else
2759                                 err = btrfs_cow_block(trans, root, b,
2760                                                       p->nodes[level + 1],
2761                                                       p->slots[level + 1], &b);
2762                         if (err) {
2763                                 ret = err;
2764                                 goto done;
2765                         }
2766                 }
2767 cow_done:
2768                 p->nodes[level] = b;
2769                 btrfs_clear_path_blocking(p, NULL, 0);
2770
2771                 /*
2772                  * we have a lock on b and as long as we aren't changing
2773                  * the tree, there is no way to for the items in b to change.
2774                  * It is safe to drop the lock on our parent before we
2775                  * go through the expensive btree search on b.
2776                  *
2777                  * If we're inserting or deleting (ins_len != 0), then we might
2778                  * be changing slot zero, which may require changing the parent.
2779                  * So, we can't drop the lock until after we know which slot
2780                  * we're operating on.
2781                  */
2782                 if (!ins_len && !p->keep_locks) {
2783                         int u = level + 1;
2784
2785                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2786                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2787                                 p->locks[u] = 0;
2788                         }
2789                 }
2790
2791                 ret = key_search(b, key, level, &prev_cmp, &slot);
2792                 if (ret < 0)
2793                         goto done;
2794
2795                 if (level != 0) {
2796                         int dec = 0;
2797                         if (ret && slot > 0) {
2798                                 dec = 1;
2799                                 slot -= 1;
2800                         }
2801                         p->slots[level] = slot;
2802                         err = setup_nodes_for_search(trans, root, p, b, level,
2803                                              ins_len, &write_lock_level);
2804                         if (err == -EAGAIN)
2805                                 goto again;
2806                         if (err) {
2807                                 ret = err;
2808                                 goto done;
2809                         }
2810                         b = p->nodes[level];
2811                         slot = p->slots[level];
2812
2813                         /*
2814                          * slot 0 is special, if we change the key
2815                          * we have to update the parent pointer
2816                          * which means we must have a write lock
2817                          * on the parent
2818                          */
2819                         if (slot == 0 && ins_len &&
2820                             write_lock_level < level + 1) {
2821                                 write_lock_level = level + 1;
2822                                 btrfs_release_path(p);
2823                                 goto again;
2824                         }
2825
2826                         unlock_up(p, level, lowest_unlock,
2827                                   min_write_lock_level, &write_lock_level);
2828
2829                         if (level == lowest_level) {
2830                                 if (dec)
2831                                         p->slots[level]++;
2832                                 goto done;
2833                         }
2834
2835                         err = read_block_for_search(root, p, &b, level,
2836                                                     slot, key);
2837                         if (err == -EAGAIN)
2838                                 goto again;
2839                         if (err) {
2840                                 ret = err;
2841                                 goto done;
2842                         }
2843
2844                         if (!p->skip_locking) {
2845                                 level = btrfs_header_level(b);
2846                                 if (level <= write_lock_level) {
2847                                         err = btrfs_try_tree_write_lock(b);
2848                                         if (!err) {
2849                                                 btrfs_set_path_blocking(p);
2850                                                 btrfs_tree_lock(b);
2851                                                 btrfs_clear_path_blocking(p, b,
2852                                                                   BTRFS_WRITE_LOCK);
2853                                         }
2854                                         p->locks[level] = BTRFS_WRITE_LOCK;
2855                                 } else {
2856                                         err = btrfs_tree_read_lock_atomic(b);
2857                                         if (!err) {
2858                                                 btrfs_set_path_blocking(p);
2859                                                 btrfs_tree_read_lock(b);
2860                                                 btrfs_clear_path_blocking(p, b,
2861                                                                   BTRFS_READ_LOCK);
2862                                         }
2863                                         p->locks[level] = BTRFS_READ_LOCK;
2864                                 }
2865                                 p->nodes[level] = b;
2866                         }
2867                 } else {
2868                         p->slots[level] = slot;
2869                         if (ins_len > 0 &&
2870                             btrfs_leaf_free_space(fs_info, b) < ins_len) {
2871                                 if (write_lock_level < 1) {
2872                                         write_lock_level = 1;
2873                                         btrfs_release_path(p);
2874                                         goto again;
2875                                 }
2876
2877                                 btrfs_set_path_blocking(p);
2878                                 err = split_leaf(trans, root, key,
2879                                                  p, ins_len, ret == 0);
2880                                 btrfs_clear_path_blocking(p, NULL, 0);
2881
2882                                 BUG_ON(err > 0);
2883                                 if (err) {
2884                                         ret = err;
2885                                         goto done;
2886                                 }
2887                         }
2888                         if (!p->search_for_split)
2889                                 unlock_up(p, level, lowest_unlock,
2890                                           min_write_lock_level, &write_lock_level);
2891                         goto done;
2892                 }
2893         }
2894         ret = 1;
2895 done:
2896         /*
2897          * we don't really know what they plan on doing with the path
2898          * from here on, so for now just mark it as blocking
2899          */
2900         if (!p->leave_spinning)
2901                 btrfs_set_path_blocking(p);
2902         if (ret < 0 && !p->skip_release_on_error)
2903                 btrfs_release_path(p);
2904         return ret;
2905 }
2906
2907 /*
2908  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2909  * current state of the tree together with the operations recorded in the tree
2910  * modification log to search for the key in a previous version of this tree, as
2911  * denoted by the time_seq parameter.
2912  *
2913  * Naturally, there is no support for insert, delete or cow operations.
2914  *
2915  * The resulting path and return value will be set up as if we called
2916  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2917  */
2918 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2919                           struct btrfs_path *p, u64 time_seq)
2920 {
2921         struct btrfs_fs_info *fs_info = root->fs_info;
2922         struct extent_buffer *b;
2923         int slot;
2924         int ret;
2925         int err;
2926         int level;
2927         int lowest_unlock = 1;
2928         u8 lowest_level = 0;
2929         int prev_cmp = -1;
2930
2931         lowest_level = p->lowest_level;
2932         WARN_ON(p->nodes[0] != NULL);
2933
2934         if (p->search_commit_root) {
2935                 BUG_ON(time_seq);
2936                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2937         }
2938
2939 again:
2940         b = get_old_root(root, time_seq);
2941         level = btrfs_header_level(b);
2942         p->locks[level] = BTRFS_READ_LOCK;
2943
2944         while (b) {
2945                 level = btrfs_header_level(b);
2946                 p->nodes[level] = b;
2947                 btrfs_clear_path_blocking(p, NULL, 0);
2948
2949                 /*
2950                  * we have a lock on b and as long as we aren't changing
2951                  * the tree, there is no way to for the items in b to change.
2952                  * It is safe to drop the lock on our parent before we
2953                  * go through the expensive btree search on b.
2954                  */
2955                 btrfs_unlock_up_safe(p, level + 1);
2956
2957                 /*
2958                  * Since we can unwind ebs we want to do a real search every
2959                  * time.
2960                  */
2961                 prev_cmp = -1;
2962                 ret = key_search(b, key, level, &prev_cmp, &slot);
2963
2964                 if (level != 0) {
2965                         int dec = 0;
2966                         if (ret && slot > 0) {
2967                                 dec = 1;
2968                                 slot -= 1;
2969                         }
2970                         p->slots[level] = slot;
2971                         unlock_up(p, level, lowest_unlock, 0, NULL);
2972
2973                         if (level == lowest_level) {
2974                                 if (dec)
2975                                         p->slots[level]++;
2976                                 goto done;
2977                         }
2978
2979                         err = read_block_for_search(root, p, &b, level,
2980                                                     slot, key);
2981                         if (err == -EAGAIN)
2982                                 goto again;
2983                         if (err) {
2984                                 ret = err;
2985                                 goto done;
2986                         }
2987
2988                         level = btrfs_header_level(b);
2989                         err = btrfs_tree_read_lock_atomic(b);
2990                         if (!err) {
2991                                 btrfs_set_path_blocking(p);
2992                                 btrfs_tree_read_lock(b);
2993                                 btrfs_clear_path_blocking(p, b,
2994                                                           BTRFS_READ_LOCK);
2995                         }
2996                         b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2997                         if (!b) {
2998                                 ret = -ENOMEM;
2999                                 goto done;
3000                         }
3001                         p->locks[level] = BTRFS_READ_LOCK;
3002                         p->nodes[level] = b;
3003                 } else {
3004                         p->slots[level] = slot;
3005                         unlock_up(p, level, lowest_unlock, 0, NULL);
3006                         goto done;
3007                 }
3008         }
3009         ret = 1;
3010 done:
3011         if (!p->leave_spinning)
3012                 btrfs_set_path_blocking(p);
3013         if (ret < 0)
3014                 btrfs_release_path(p);
3015
3016         return ret;
3017 }
3018
3019 /*
3020  * helper to use instead of search slot if no exact match is needed but
3021  * instead the next or previous item should be returned.
3022  * When find_higher is true, the next higher item is returned, the next lower
3023  * otherwise.
3024  * When return_any and find_higher are both true, and no higher item is found,
3025  * return the next lower instead.
3026  * When return_any is true and find_higher is false, and no lower item is found,
3027  * return the next higher instead.
3028  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3029  * < 0 on error
3030  */
3031 int btrfs_search_slot_for_read(struct btrfs_root *root,
3032                                const struct btrfs_key *key,
3033                                struct btrfs_path *p, int find_higher,
3034                                int return_any)
3035 {
3036         int ret;
3037         struct extent_buffer *leaf;
3038
3039 again:
3040         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3041         if (ret <= 0)
3042                 return ret;
3043         /*
3044          * a return value of 1 means the path is at the position where the
3045          * item should be inserted. Normally this is the next bigger item,
3046          * but in case the previous item is the last in a leaf, path points
3047          * to the first free slot in the previous leaf, i.e. at an invalid
3048          * item.
3049          */
3050         leaf = p->nodes[0];
3051
3052         if (find_higher) {
3053                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3054                         ret = btrfs_next_leaf(root, p);
3055                         if (ret <= 0)
3056                                 return ret;
3057                         if (!return_any)
3058                                 return 1;
3059                         /*
3060                          * no higher item found, return the next
3061                          * lower instead
3062                          */
3063                         return_any = 0;
3064                         find_higher = 0;
3065                         btrfs_release_path(p);
3066                         goto again;
3067                 }
3068         } else {
3069                 if (p->slots[0] == 0) {
3070                         ret = btrfs_prev_leaf(root, p);
3071                         if (ret < 0)
3072                                 return ret;
3073                         if (!ret) {
3074                                 leaf = p->nodes[0];
3075                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3076                                         p->slots[0]--;
3077                                 return 0;
3078                         }
3079                         if (!return_any)
3080                                 return 1;
3081                         /*
3082                          * no lower item found, return the next
3083                          * higher instead
3084                          */
3085                         return_any = 0;
3086                         find_higher = 1;
3087                         btrfs_release_path(p);
3088                         goto again;
3089                 } else {
3090                         --p->slots[0];
3091                 }
3092         }
3093         return 0;
3094 }
3095
3096 /*
 * adjust the keys going up the tree, starting at level,
 * making sure the key in the path's slot of each node is set to 'key'.
3099  * This is used after shifting pointers to the left, so it stops
3100  * fixing up pointers when a given leaf/node is not in slot 0 of the
3101  * higher levels
3102  *
3103  */
3104 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3105                            struct btrfs_path *path,
3106                            struct btrfs_disk_key *key, int level)
3107 {
3108         int i;
3109         struct extent_buffer *t;
3110         int ret;
3111
3112         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3113                 int tslot = path->slots[i];
3114
3115                 if (!path->nodes[i])
3116                         break;
3117                 t = path->nodes[i];
3118                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3119                                 GFP_ATOMIC);
3120                 BUG_ON(ret < 0);
3121                 btrfs_set_node_key(t, key, tslot);
3122                 btrfs_mark_buffer_dirty(path->nodes[i]);
3123                 if (tslot != 0)
3124                         break;
3125         }
3126 }
3127
3128 /*
3129  * update item key.
3130  *
3131  * This function isn't completely safe. It's the caller's responsibility
3132  * that the new key won't break the order
3133  */
3134 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3135                              struct btrfs_path *path,
3136                              const struct btrfs_key *new_key)
3137 {
3138         struct btrfs_disk_key disk_key;
3139         struct extent_buffer *eb;
3140         int slot;
3141
3142         eb = path->nodes[0];
3143         slot = path->slots[0];
3144         if (slot > 0) {
3145                 btrfs_item_key(eb, &disk_key, slot - 1);
3146                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3147         }
3148         if (slot < btrfs_header_nritems(eb) - 1) {
3149                 btrfs_item_key(eb, &disk_key, slot + 1);
3150                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3151         }
3152
3153         btrfs_cpu_key_to_disk(&disk_key, new_key);
3154         btrfs_set_item_key(eb, &disk_key, slot);
3155         btrfs_mark_buffer_dirty(eb);
3156         if (slot == 0)
3157                 fixup_low_keys(fs_info, path, &disk_key, 1);
3158 }
3159
3160 /*
3161  * try to push data from one node into the next node left in the
3162  * tree.
3163  *
3164  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3165  * error, and > 0 if there was no room in the left hand block.
3166  */
3167 static int push_node_left(struct btrfs_trans_handle *trans,
3168                           struct btrfs_fs_info *fs_info,
3169                           struct extent_buffer *dst,
3170                           struct extent_buffer *src, int empty)
3171 {
3172         int push_items = 0;
3173         int src_nritems;
3174         int dst_nritems;
3175         int ret = 0;
3176
3177         src_nritems = btrfs_header_nritems(src);
3178         dst_nritems = btrfs_header_nritems(dst);
3179         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3180         WARN_ON(btrfs_header_generation(src) != trans->transid);
3181         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3182
3183         if (!empty && src_nritems <= 8)
3184                 return 1;
3185
3186         if (push_items <= 0)
3187                 return 1;
3188
3189         if (empty) {
3190                 push_items = min(src_nritems, push_items);
3191                 if (push_items < src_nritems) {
3192                         /* leave at least 8 pointers in the node if
3193                          * we aren't going to empty it
3194                          */
3195                         if (src_nritems - push_items < 8) {
3196                                 if (push_items <= 8)
3197                                         return 1;
3198                                 push_items -= 8;
3199                         }
3200                 }
3201         } else
3202                 push_items = min(src_nritems - 8, push_items);
3203
3204         ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3205                                    push_items);
3206         if (ret) {
3207                 btrfs_abort_transaction(trans, ret);
3208                 return ret;
3209         }
3210         copy_extent_buffer(dst, src,
3211                            btrfs_node_key_ptr_offset(dst_nritems),
3212                            btrfs_node_key_ptr_offset(0),
3213                            push_items * sizeof(struct btrfs_key_ptr));
3214
3215         if (push_items < src_nritems) {
3216                 /*
3217                  * Don't call tree_mod_log_insert_move here, key removal was
3218                  * already fully logged by tree_mod_log_eb_copy above.
3219                  */
3220                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3221                                       btrfs_node_key_ptr_offset(push_items),
3222                                       (src_nritems - push_items) *
3223                                       sizeof(struct btrfs_key_ptr));
3224         }
3225         btrfs_set_header_nritems(src, src_nritems - push_items);
3226         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3227         btrfs_mark_buffer_dirty(src);
3228         btrfs_mark_buffer_dirty(dst);
3229
3230         return ret;
3231 }
3232
3233 /*
3234  * try to push data from one node into the next node right in the
3235  * tree.
3236  *
3237  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3238  * error, and > 0 if there was no room in the right hand block.
3239  *
3240  * this will  only push up to 1/2 the contents of the left node over
3241  */
3242 static int balance_node_right(struct btrfs_trans_handle *trans,
3243                               struct btrfs_fs_info *fs_info,
3244                               struct extent_buffer *dst,
3245                               struct extent_buffer *src)
3246 {
3247         int push_items = 0;
3248         int max_push;
3249         int src_nritems;
3250         int dst_nritems;
3251         int ret = 0;
3252
3253         WARN_ON(btrfs_header_generation(src) != trans->transid);
3254         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3255
3256         src_nritems = btrfs_header_nritems(src);
3257         dst_nritems = btrfs_header_nritems(dst);
3258         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3259         if (push_items <= 0)
3260                 return 1;
3261
3262         if (src_nritems < 4)
3263                 return 1;
3264
3265         max_push = src_nritems / 2 + 1;
3266         /* don't try to empty the node */
3267         if (max_push >= src_nritems)
3268                 return 1;
3269
3270         if (max_push < push_items)
3271                 push_items = max_push;
3272
3273         ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3274         BUG_ON(ret < 0);
3275         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3276                                       btrfs_node_key_ptr_offset(0),
3277                                       (dst_nritems) *
3278                                       sizeof(struct btrfs_key_ptr));
3279
3280         ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3281                                    src_nritems - push_items, push_items);
3282         if (ret) {
3283                 btrfs_abort_transaction(trans, ret);
3284                 return ret;
3285         }
3286         copy_extent_buffer(dst, src,
3287                            btrfs_node_key_ptr_offset(0),
3288                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3289                            push_items * sizeof(struct btrfs_key_ptr));
3290
3291         btrfs_set_header_nritems(src, src_nritems - push_items);
3292         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3293
3294         btrfs_mark_buffer_dirty(src);
3295         btrfs_mark_buffer_dirty(dst);
3296
3297         return ret;
3298 }
3299
3300 /*
3301  * helper function to insert a new root level in the tree.
3302  * A new node is allocated, and a single item is inserted to
3303  * point to the existing root
3304  *
3305  * returns zero on success or < 0 on failure.
3306  */
3307 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3308                            struct btrfs_root *root,
3309                            struct btrfs_path *path, int level)
3310 {
3311         struct btrfs_fs_info *fs_info = root->fs_info;
3312         u64 lower_gen;
3313         struct extent_buffer *lower;
3314         struct extent_buffer *c;
3315         struct extent_buffer *old;
3316         struct btrfs_disk_key lower_key;
3317         int ret;
3318
3319         BUG_ON(path->nodes[level]);
3320         BUG_ON(path->nodes[level-1] != root->node);
3321
3322         lower = path->nodes[level-1];
3323         if (level == 1)
3324                 btrfs_item_key(lower, &lower_key, 0);
3325         else
3326                 btrfs_node_key(lower, &lower_key, 0);
3327
3328         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3329                                    &lower_key, level, root->node->start, 0);
3330         if (IS_ERR(c))
3331                 return PTR_ERR(c);
3332
3333         root_add_used(root, fs_info->nodesize);
3334
3335         memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
3336         btrfs_set_header_nritems(c, 1);
3337         btrfs_set_header_level(c, level);
3338         btrfs_set_header_bytenr(c, c->start);
3339         btrfs_set_header_generation(c, trans->transid);
3340         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3341         btrfs_set_header_owner(c, root->root_key.objectid);
3342
3343         write_extent_buffer_fsid(c, fs_info->fsid);
3344         write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);
3345
3346         btrfs_set_node_key(c, &lower_key, 0);
3347         btrfs_set_node_blockptr(c, 0, lower->start);
3348         lower_gen = btrfs_header_generation(lower);
3349         WARN_ON(lower_gen != trans->transid);
3350
3351         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3352
3353         btrfs_mark_buffer_dirty(c);
3354
3355         old = root->node;
3356         ret = tree_mod_log_insert_root(root->node, c, 0);
3357         BUG_ON(ret < 0);
3358         rcu_assign_pointer(root->node, c);
3359
3360         /* the super has an extra ref to root->node */
3361         free_extent_buffer(old);
3362
3363         add_root_to_dirty_list(root);
3364         extent_buffer_get(c);
3365         path->nodes[level] = c;
3366         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3367         path->slots[level] = 0;
3368         return 0;
3369 }
3370
3371 /*
3372  * worker function to insert a single pointer in a node.
3373  * the node should have enough room for the pointer already
3374  *
3375  * slot and level indicate where you want the key to go, and
3376  * blocknr is the block the key points to.
3377  */
3378 static void insert_ptr(struct btrfs_trans_handle *trans,
3379                        struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3380                        struct btrfs_disk_key *key, u64 bytenr,
3381                        int slot, int level)
3382 {
3383         struct extent_buffer *lower;
3384         int nritems;
3385         int ret;
3386
3387         BUG_ON(!path->nodes[level]);
3388         btrfs_assert_tree_locked(path->nodes[level]);
3389         lower = path->nodes[level];
3390         nritems = btrfs_header_nritems(lower);
3391         BUG_ON(slot > nritems);
3392         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3393         if (slot != nritems) {
3394                 if (level) {
3395                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3396                                         nritems - slot);
3397                         BUG_ON(ret < 0);
3398                 }
3399                 memmove_extent_buffer(lower,
3400                               btrfs_node_key_ptr_offset(slot + 1),
3401                               btrfs_node_key_ptr_offset(slot),
3402                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3403         }
3404         if (level) {
3405                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3406                                 GFP_NOFS);
3407                 BUG_ON(ret < 0);
3408         }
3409         btrfs_set_node_key(lower, key, slot);
3410         btrfs_set_node_blockptr(lower, slot, bytenr);
3411         WARN_ON(trans->transid == 0);
3412         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3413         btrfs_set_header_nritems(lower, nritems + 1);
3414         btrfs_mark_buffer_dirty(lower);
3415 }
3416
3417 /*
3418  * split the node at the specified level in path in two.
3419  * The path is corrected to point to the appropriate node after the split
3420  *
3421  * Before splitting this tries to make some room in the node by pushing
3422  * left and right, if either one works, it returns right away.
3423  *
3424  * returns 0 on success and < 0 on failure
3425  */
3426 static noinline int split_node(struct btrfs_trans_handle *trans,
3427                                struct btrfs_root *root,
3428                                struct btrfs_path *path, int level)
3429 {
3430         struct btrfs_fs_info *fs_info = root->fs_info;
3431         struct extent_buffer *c;
3432         struct extent_buffer *split;
3433         struct btrfs_disk_key disk_key;
3434         int mid;
3435         int ret;
3436         u32 c_nritems;
3437
3438         c = path->nodes[level];
3439         WARN_ON(btrfs_header_generation(c) != trans->transid);
3440         if (c == root->node) {
3441                 /*
3442                  * trying to split the root, lets make a new one
3443                  *
3444                  * tree mod log: We don't log_removal old root in
3445                  * insert_new_root, because that root buffer will be kept as a
3446                  * normal node. We are going to log removal of half of the
3447                  * elements below with tree_mod_log_eb_copy. We're holding a
3448                  * tree lock on the buffer, which is why we cannot race with
3449                  * other tree_mod_log users.
3450                  */
3451                 ret = insert_new_root(trans, root, path, level + 1);
3452                 if (ret)
3453                         return ret;
3454         } else {
3455                 ret = push_nodes_for_insert(trans, root, path, level);
3456                 c = path->nodes[level];
3457                 if (!ret && btrfs_header_nritems(c) <
3458                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3459                         return 0;
3460                 if (ret < 0)
3461                         return ret;
3462         }
3463
3464         c_nritems = btrfs_header_nritems(c);
3465         mid = (c_nritems + 1) / 2;
3466         btrfs_node_key(c, &disk_key, mid);
3467
3468         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3469                         &disk_key, level, c->start, 0);
3470         if (IS_ERR(split))
3471                 return PTR_ERR(split);
3472
3473         root_add_used(root, fs_info->nodesize);
3474
3475         memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
3476         btrfs_set_header_level(split, btrfs_header_level(c));
3477         btrfs_set_header_bytenr(split, split->start);
3478         btrfs_set_header_generation(split, trans->transid);
3479         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3480         btrfs_set_header_owner(split, root->root_key.objectid);
3481         write_extent_buffer_fsid(split, fs_info->fsid);
3482         write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);
3483
3484         ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3485         if (ret) {
3486                 btrfs_abort_transaction(trans, ret);
3487                 return ret;
3488         }
3489         copy_extent_buffer(split, c,
3490                            btrfs_node_key_ptr_offset(0),
3491                            btrfs_node_key_ptr_offset(mid),
3492                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3493         btrfs_set_header_nritems(split, c_nritems - mid);
3494         btrfs_set_header_nritems(c, mid);
3495         ret = 0;
3496
3497         btrfs_mark_buffer_dirty(c);
3498         btrfs_mark_buffer_dirty(split);
3499
3500         insert_ptr(trans, fs_info, path, &disk_key, split->start,
3501                    path->slots[level + 1] + 1, level + 1);
3502
3503         if (path->slots[level] >= mid) {
3504                 path->slots[level] -= mid;
3505                 btrfs_tree_unlock(c);
3506                 free_extent_buffer(c);
3507                 path->nodes[level] = split;
3508                 path->slots[level + 1] += 1;
3509         } else {
3510                 btrfs_tree_unlock(split);
3511                 free_extent_buffer(split);
3512         }
3513         return ret;
3514 }
3515
3516 /*
3517  * how many bytes are required to store the items in a leaf.  start
3518  * and nr indicate which items in the leaf to check.  This totals up the
3519  * space used both by the item structs and the item data
3520  */
3521 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3522 {
3523         struct btrfs_item *start_item;
3524         struct btrfs_item *end_item;
3525         struct btrfs_map_token token;
3526         int data_len;
3527         int nritems = btrfs_header_nritems(l);
3528         int end = min(nritems, start + nr) - 1;
3529
3530         if (!nr)
3531                 return 0;
3532         btrfs_init_map_token(&token);
3533         start_item = btrfs_item_nr(start);
3534         end_item = btrfs_item_nr(end);
3535         data_len = btrfs_token_item_offset(l, start_item, &token) +
3536                 btrfs_token_item_size(l, start_item, &token);
3537         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3538         data_len += sizeof(struct btrfs_item) * nr;
3539         WARN_ON(data_len < 0);
3540         return data_len;
3541 }
3542
3543 /*
3544  * The space between the end of the leaf items and
3545  * the start of the leaf data.  IOW, how much room
3546  * the leaf has left for both items and data
3547  */
3548 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3549                                    struct extent_buffer *leaf)
3550 {
3551         int nritems = btrfs_header_nritems(leaf);
3552         int ret;
3553
3554         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3555         if (ret < 0) {
3556                 btrfs_crit(fs_info,
3557                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3558                            ret,
3559                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3560                            leaf_space_used(leaf, 0, nritems), nritems);
3561         }
3562         return ret;
3563 }
3564
3565 /*
3566  * min slot controls the lowest index we're willing to push to the
3567  * right.  We'll push up to and including min_slot, but no lower
3568  */
3569 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3570                                       struct btrfs_path *path,
3571                                       int data_size, int empty,
3572                                       struct extent_buffer *right,
3573                                       int free_space, u32 left_nritems,
3574                                       u32 min_slot)
3575 {
3576         struct extent_buffer *left = path->nodes[0];
3577         struct extent_buffer *upper = path->nodes[1];
3578         struct btrfs_map_token token;
3579         struct btrfs_disk_key disk_key;
3580         int slot;
3581         u32 i;
3582         int push_space = 0;
3583         int push_items = 0;
3584         struct btrfs_item *item;
3585         u32 nr;
3586         u32 right_nritems;
3587         u32 data_end;
3588         u32 this_item_size;
3589
3590         btrfs_init_map_token(&token);
3591
3592         if (empty)
3593                 nr = 0;
3594         else
3595                 nr = max_t(u32, 1, min_slot);
3596
3597         if (path->slots[0] >= left_nritems)
3598                 push_space += data_size;
3599
3600         slot = path->slots[1];
3601         i = left_nritems - 1;
3602         while (i >= nr) {
3603                 item = btrfs_item_nr(i);
3604
3605                 if (!empty && push_items > 0) {
3606                         if (path->slots[0] > i)
3607                                 break;
3608                         if (path->slots[0] == i) {
3609                                 int space = btrfs_leaf_free_space(fs_info, left);
3610                                 if (space + push_space * 2 > free_space)
3611                                         break;
3612                         }
3613                 }
3614
3615                 if (path->slots[0] == i)
3616                         push_space += data_size;
3617
3618                 this_item_size = btrfs_item_size(left, item);
3619                 if (this_item_size + sizeof(*item) + push_space > free_space)
3620                         break;
3621
3622                 push_items++;
3623                 push_space += this_item_size + sizeof(*item);
3624                 if (i == 0)
3625                         break;
3626                 i--;
3627         }
3628
3629         if (push_items == 0)
3630                 goto out_unlock;
3631
3632         WARN_ON(!empty && push_items == left_nritems);
3633
3634         /* push left to right */
3635         right_nritems = btrfs_header_nritems(right);
3636
3637         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3638         push_space -= leaf_data_end(fs_info, left);
3639
3640         /* make room in the right data area */
3641         data_end = leaf_data_end(fs_info, right);
3642         memmove_extent_buffer(right,
3643                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3644                               BTRFS_LEAF_DATA_OFFSET + data_end,
3645                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3646
3647         /* copy from the left data area */
3648         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3649                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3650                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3651                      push_space);
3652
3653         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3654                               btrfs_item_nr_offset(0),
3655                               right_nritems * sizeof(struct btrfs_item));
3656
3657         /* copy the items from left to right */
3658         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3659                    btrfs_item_nr_offset(left_nritems - push_items),
3660                    push_items * sizeof(struct btrfs_item));
3661
3662         /* update the item pointers */
3663         right_nritems += push_items;
3664         btrfs_set_header_nritems(right, right_nritems);
3665         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3666         for (i = 0; i < right_nritems; i++) {
3667                 item = btrfs_item_nr(i);
3668                 push_space -= btrfs_token_item_size(right, item, &token);
3669                 btrfs_set_token_item_offset(right, item, push_space, &token);
3670         }
3671
3672         left_nritems -= push_items;
3673         btrfs_set_header_nritems(left, left_nritems);
3674
3675         if (left_nritems)
3676                 btrfs_mark_buffer_dirty(left);
3677         else
3678                 clean_tree_block(fs_info, left);
3679
3680         btrfs_mark_buffer_dirty(right);
3681
3682         btrfs_item_key(right, &disk_key, 0);
3683         btrfs_set_node_key(upper, &disk_key, slot + 1);
3684         btrfs_mark_buffer_dirty(upper);
3685
3686         /* then fixup the leaf pointer in the path */
3687         if (path->slots[0] >= left_nritems) {
3688                 path->slots[0] -= left_nritems;
3689                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3690                         clean_tree_block(fs_info, path->nodes[0]);
3691                 btrfs_tree_unlock(path->nodes[0]);
3692                 free_extent_buffer(path->nodes[0]);
3693                 path->nodes[0] = right;
3694                 path->slots[1] += 1;
3695         } else {
3696                 btrfs_tree_unlock(right);
3697                 free_extent_buffer(right);
3698         }
3699         return 0;
3700
3701 out_unlock:
3702         btrfs_tree_unlock(right);
3703         free_extent_buffer(right);
3704         return 1;
3705 }
3706
3707 /*
3708  * push some data in the path leaf to the right, trying to free up at
3709  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3710  *
3711  * returns 1 if the push failed because the other node didn't have enough
3712  * room, 0 if everything worked out and < 0 if there were major errors.
3713  *
3714  * this will push starting from min_slot to the end of the leaf.  It won't
3715  * push any slot lower than min_slot
3716  */
3717 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3718                            *root, struct btrfs_path *path,
3719                            int min_data_size, int data_size,
3720                            int empty, u32 min_slot)
3721 {
3722         struct btrfs_fs_info *fs_info = root->fs_info;
3723         struct extent_buffer *left = path->nodes[0];
3724         struct extent_buffer *right;
3725         struct extent_buffer *upper;
3726         int slot;
3727         int free_space;
3728         u32 left_nritems;
3729         int ret;
3730
3731         if (!path->nodes[1])
3732                 return 1;
3733
3734         slot = path->slots[1];
3735         upper = path->nodes[1];
3736         if (slot >= btrfs_header_nritems(upper) - 1)
3737                 return 1;
3738
3739         btrfs_assert_tree_locked(path->nodes[1]);
3740
3741         right = read_node_slot(fs_info, upper, slot + 1);
3742         /*
3743          * slot + 1 is not valid or we fail to read the right node,
3744          * no big deal, just return.
3745          */
3746         if (IS_ERR(right))
3747                 return 1;
3748
3749         btrfs_tree_lock(right);
3750         btrfs_set_lock_blocking(right);
3751
3752         free_space = btrfs_leaf_free_space(fs_info, right);
3753         if (free_space < data_size)
3754                 goto out_unlock;
3755
3756         /* cow and double check */
3757         ret = btrfs_cow_block(trans, root, right, upper,
3758                               slot + 1, &right);
3759         if (ret)
3760                 goto out_unlock;
3761
3762         free_space = btrfs_leaf_free_space(fs_info, right);
3763         if (free_space < data_size)
3764                 goto out_unlock;
3765
3766         left_nritems = btrfs_header_nritems(left);
3767         if (left_nritems == 0)
3768                 goto out_unlock;
3769
3770         if (path->slots[0] == left_nritems && !empty) {
3771                 /* Key greater than all keys in the leaf, right neighbor has
3772                  * enough room for it and we're not emptying our leaf to delete
3773                  * it, therefore use right neighbor to insert the new item and
3774                  * no need to touch/dirty our left leaft. */
3775                 btrfs_tree_unlock(left);
3776                 free_extent_buffer(left);
3777                 path->nodes[0] = right;
3778                 path->slots[0] = 0;
3779                 path->slots[1]++;
3780                 return 0;
3781         }
3782
3783         return __push_leaf_right(fs_info, path, min_data_size, empty,
3784                                 right, free_space, left_nritems, min_slot);
3785 out_unlock:
3786         btrfs_tree_unlock(right);
3787         free_extent_buffer(right);
3788         return 1;
3789 }
3790
3791 /*
3792  * push some data in the path leaf to the left, trying to free up at
3793  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3794  *
3795  * max_slot can put a limit on how far into the leaf we'll push items.  The
3796  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3797  * items
3798  */
3799 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3800                                      struct btrfs_path *path, int data_size,
3801                                      int empty, struct extent_buffer *left,
3802                                      int free_space, u32 right_nritems,
3803                                      u32 max_slot)
3804 {
3805         struct btrfs_disk_key disk_key;
3806         struct extent_buffer *right = path->nodes[0];
3807         int i;
3808         int push_space = 0;
3809         int push_items = 0;
3810         struct btrfs_item *item;
3811         u32 old_left_nritems;
3812         u32 nr;
3813         int ret = 0;
3814         u32 this_item_size;
3815         u32 old_left_item_size;
3816         struct btrfs_map_token token;
3817
3818         btrfs_init_map_token(&token);
3819
3820         if (empty)
3821                 nr = min(right_nritems, max_slot);
3822         else
3823                 nr = min(right_nritems - 1, max_slot);
3824
3825         for (i = 0; i < nr; i++) {
3826                 item = btrfs_item_nr(i);
3827
3828                 if (!empty && push_items > 0) {
3829                         if (path->slots[0] < i)
3830                                 break;
3831                         if (path->slots[0] == i) {
3832                                 int space = btrfs_leaf_free_space(fs_info, right);
3833                                 if (space + push_space * 2 > free_space)
3834                                         break;
3835                         }
3836                 }
3837
3838                 if (path->slots[0] == i)
3839                         push_space += data_size;
3840
3841                 this_item_size = btrfs_item_size(right, item);
3842                 if (this_item_size + sizeof(*item) + push_space > free_space)
3843                         break;
3844
3845                 push_items++;
3846                 push_space += this_item_size + sizeof(*item);
3847         }
3848
3849         if (push_items == 0) {
3850                 ret = 1;
3851                 goto out;
3852         }
3853         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3854
3855         /* push data from right to left */
3856         copy_extent_buffer(left, right,
3857                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3858                            btrfs_item_nr_offset(0),
3859                            push_items * sizeof(struct btrfs_item));
3860
3861         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3862                      btrfs_item_offset_nr(right, push_items - 1);
3863
3864         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3865                      leaf_data_end(fs_info, left) - push_space,
3866                      BTRFS_LEAF_DATA_OFFSET +
3867                      btrfs_item_offset_nr(right, push_items - 1),
3868                      push_space);
3869         old_left_nritems = btrfs_header_nritems(left);
3870         BUG_ON(old_left_nritems <= 0);
3871
3872         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3873         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3874                 u32 ioff;
3875
3876                 item = btrfs_item_nr(i);
3877
3878                 ioff = btrfs_token_item_offset(left, item, &token);
3879                 btrfs_set_token_item_offset(left, item,
3880                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3881                       &token);
3882         }
3883         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3884
3885         /* fixup right node */
3886         if (push_items > right_nritems)
3887                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3888                        right_nritems);
3889
3890         if (push_items < right_nritems) {
3891                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3892                                                   leaf_data_end(fs_info, right);
3893                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3894                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3895                                       BTRFS_LEAF_DATA_OFFSET +
3896                                       leaf_data_end(fs_info, right), push_space);
3897
3898                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3899                               btrfs_item_nr_offset(push_items),
3900                              (btrfs_header_nritems(right) - push_items) *
3901                              sizeof(struct btrfs_item));
3902         }
3903         right_nritems -= push_items;
3904         btrfs_set_header_nritems(right, right_nritems);
3905         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3906         for (i = 0; i < right_nritems; i++) {
3907                 item = btrfs_item_nr(i);
3908
3909                 push_space = push_space - btrfs_token_item_size(right,
3910                                                                 item, &token);
3911                 btrfs_set_token_item_offset(right, item, push_space, &token);
3912         }
3913
3914         btrfs_mark_buffer_dirty(left);
3915         if (right_nritems)
3916                 btrfs_mark_buffer_dirty(right);
3917         else
3918                 clean_tree_block(fs_info, right);
3919
3920         btrfs_item_key(right, &disk_key, 0);
3921         fixup_low_keys(fs_info, path, &disk_key, 1);
3922
3923         /* then fixup the leaf pointer in the path */
3924         if (path->slots[0] < push_items) {
3925                 path->slots[0] += old_left_nritems;
3926                 btrfs_tree_unlock(path->nodes[0]);
3927                 free_extent_buffer(path->nodes[0]);
3928                 path->nodes[0] = left;
3929                 path->slots[1] -= 1;
3930         } else {
3931                 btrfs_tree_unlock(left);
3932                 free_extent_buffer(left);
3933                 path->slots[0] -= push_items;
3934         }
3935         BUG_ON(path->slots[0] < 0);
3936         return ret;
3937 out:
3938         btrfs_tree_unlock(left);
3939         free_extent_buffer(left);
3940         return ret;
3941 }
3942
3943 /*
3944  * push some data in the path leaf to the left, trying to free up at
3945  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3946  *
3947  * max_slot can put a limit on how far into the leaf we'll push items.  The
3948  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3949  * items
3950  */
3951 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3952                           *root, struct btrfs_path *path, int min_data_size,
3953                           int data_size, int empty, u32 max_slot)
3954 {
3955         struct btrfs_fs_info *fs_info = root->fs_info;
3956         struct extent_buffer *right = path->nodes[0];
3957         struct extent_buffer *left;
3958         int slot;
3959         int free_space;
3960         u32 right_nritems;
3961         int ret = 0;
3962
3963         slot = path->slots[1];
3964         if (slot == 0)
3965                 return 1;
3966         if (!path->nodes[1])
3967                 return 1;
3968
3969         right_nritems = btrfs_header_nritems(right);
3970         if (right_nritems == 0)
3971                 return 1;
3972
3973         btrfs_assert_tree_locked(path->nodes[1]);
3974
3975         left = read_node_slot(fs_info, path->nodes[1], slot - 1);
3976         /*
3977          * slot - 1 is not valid or we fail to read the left node,
3978          * no big deal, just return.
3979          */
3980         if (IS_ERR(left))
3981                 return 1;
3982
3983         btrfs_tree_lock(left);
3984         btrfs_set_lock_blocking(left);
3985
3986         free_space = btrfs_leaf_free_space(fs_info, left);
3987         if (free_space < data_size) {
3988                 ret = 1;
3989                 goto out;
3990         }
3991
3992         /* cow and double check */
3993         ret = btrfs_cow_block(trans, root, left,
3994                               path->nodes[1], slot - 1, &left);
3995         if (ret) {
3996                 /* we hit -ENOSPC, but it isn't fatal here */
3997                 if (ret == -ENOSPC)
3998                         ret = 1;
3999                 goto out;
4000         }
4001
4002         free_space = btrfs_leaf_free_space(fs_info, left);
4003         if (free_space < data_size) {
4004                 ret = 1;
4005                 goto out;
4006         }
4007
4008         return __push_leaf_left(fs_info, path, min_data_size,
4009                                empty, left, free_space, right_nritems,
4010                                max_slot);
4011 out:
4012         btrfs_tree_unlock(left);
4013         free_extent_buffer(left);
4014         return ret;
4015 }
4016
4017 /*
4018  * split the path's leaf in two, making sure there is at least data_size
4019  * available for the resulting leaf level of the path.
4020  */
4021 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4022                                     struct btrfs_fs_info *fs_info,
4023                                     struct btrfs_path *path,
4024                                     struct extent_buffer *l,
4025                                     struct extent_buffer *right,
4026                                     int slot, int mid, int nritems)
4027 {
4028         int data_copy_size;
4029         int rt_data_off;
4030         int i;
4031         struct btrfs_disk_key disk_key;
4032         struct btrfs_map_token token;
4033
4034         btrfs_init_map_token(&token);
4035
4036         nritems = nritems - mid;
4037         btrfs_set_header_nritems(right, nritems);
4038         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4039
4040         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4041                            btrfs_item_nr_offset(mid),
4042                            nritems * sizeof(struct btrfs_item));
4043
4044         copy_extent_buffer(right, l,
4045                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4046                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4047                      leaf_data_end(fs_info, l), data_copy_size);
4048
4049         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4050
4051         for (i = 0; i < nritems; i++) {
4052                 struct btrfs_item *item = btrfs_item_nr(i);
4053                 u32 ioff;
4054
4055                 ioff = btrfs_token_item_offset(right, item, &token);
4056                 btrfs_set_token_item_offset(right, item,
4057                                             ioff + rt_data_off, &token);
4058         }
4059
4060         btrfs_set_header_nritems(l, mid);
4061         btrfs_item_key(right, &disk_key, 0);
4062         insert_ptr(trans, fs_info, path, &disk_key, right->start,
4063                    path->slots[1] + 1, 1);
4064
4065         btrfs_mark_buffer_dirty(right);
4066         btrfs_mark_buffer_dirty(l);
4067         BUG_ON(path->slots[0] != slot);
4068
4069         if (mid <= slot) {
4070                 btrfs_tree_unlock(path->nodes[0]);
4071                 free_extent_buffer(path->nodes[0]);
4072                 path->nodes[0] = right;
4073                 path->slots[0] -= mid;
4074                 path->slots[1] += 1;
4075         } else {
4076                 btrfs_tree_unlock(right);
4077                 free_extent_buffer(right);
4078         }
4079
4080         BUG_ON(path->slots[0] < 0);
4081 }
4082
4083 /*
4084  * double splits happen when we need to insert a big item in the middle
4085  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4086  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4087  *          A                 B                 C
4088  *
4089  * We avoid this by trying to push the items on either side of our target
4090  * into the adjacent leaves.  If all goes well we can avoid the double split
4091  * completely.
4092  */
4093 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4094                                           struct btrfs_root *root,
4095                                           struct btrfs_path *path,
4096                                           int data_size)
4097 {
4098         struct btrfs_fs_info *fs_info = root->fs_info;
4099         int ret;
4100         int progress = 0;
4101         int slot;
4102         u32 nritems;
4103         int space_needed = data_size;
4104
4105         slot = path->slots[0];
4106         if (slot < btrfs_header_nritems(path->nodes[0]))
4107                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4108
4109         /*
4110          * try to push all the items after our slot into the
4111          * right leaf
4112          */
4113         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4114         if (ret < 0)
4115                 return ret;
4116
4117         if (ret == 0)
4118                 progress++;
4119
4120         nritems = btrfs_header_nritems(path->nodes[0]);
4121         /*
4122          * our goal is to get our slot at the start or end of a leaf.  If
4123          * we've done so we're done
4124          */
4125         if (path->slots[0] == 0 || path->slots[0] == nritems)
4126                 return 0;
4127
4128         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4129                 return 0;
4130
4131         /* try to push all the items before our slot into the next leaf */
4132         slot = path->slots[0];
4133         space_needed = data_size;
4134         if (slot > 0)
4135                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4136         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4137         if (ret < 0)
4138                 return ret;
4139
4140         if (ret == 0)
4141                 progress++;
4142
4143         if (progress)
4144                 return 0;
4145         return 1;
4146 }
4147
4148 /*
4149  * split the path's leaf in two, making sure there is at least data_size
4150  * available for the resulting leaf level of the path.
4151  *
4152  * returns 0 if all went well and < 0 on failure.
4153  */
4154 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4155                                struct btrfs_root *root,
4156                                const struct btrfs_key *ins_key,
4157                                struct btrfs_path *path, int data_size,
4158                                int extend)
4159 {
4160         struct btrfs_disk_key disk_key;
4161         struct extent_buffer *l;
4162         u32 nritems;
4163         int mid;
4164         int slot;
4165         struct extent_buffer *right;
4166         struct btrfs_fs_info *fs_info = root->fs_info;
4167         int ret = 0;
4168         int wret;
4169         int split;
4170         int num_doubles = 0;
4171         int tried_avoid_double = 0;
4172
4173         l = path->nodes[0];
4174         slot = path->slots[0];
4175         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4176             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4177                 return -EOVERFLOW;
4178
4179         /* first try to make some room by pushing left and right */
4180         if (data_size && path->nodes[1]) {
4181                 int space_needed = data_size;
4182
4183                 if (slot < btrfs_header_nritems(l))
4184                         space_needed -= btrfs_leaf_free_space(fs_info, l);
4185
4186                 wret = push_leaf_right(trans, root, path, space_needed,
4187                                        space_needed, 0, 0);
4188                 if (wret < 0)
4189                         return wret;
4190                 if (wret) {
4191                         space_needed = data_size;
4192                         if (slot > 0)
4193                                 space_needed -= btrfs_leaf_free_space(fs_info,
4194                                                                       l);
4195                         wret = push_leaf_left(trans, root, path, space_needed,
4196                                               space_needed, 0, (u32)-1);
4197                         if (wret < 0)
4198                                 return wret;
4199                 }
4200                 l = path->nodes[0];
4201
4202                 /* did the pushes work? */
4203                 if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4204                         return 0;
4205         }
4206
4207         if (!path->nodes[1]) {
4208                 ret = insert_new_root(trans, root, path, 1);
4209                 if (ret)
4210                         return ret;
4211         }
4212 again:
4213         split = 1;
4214         l = path->nodes[0];
4215         slot = path->slots[0];
4216         nritems = btrfs_header_nritems(l);
4217         mid = (nritems + 1) / 2;
4218
4219         if (mid <= slot) {
4220                 if (nritems == 1 ||
4221                     leaf_space_used(l, mid, nritems - mid) + data_size >
4222                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4223                         if (slot >= nritems) {
4224                                 split = 0;
4225                         } else {
4226                                 mid = slot;
4227                                 if (mid != nritems &&
4228                                     leaf_space_used(l, mid, nritems - mid) +
4229                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4230                                         if (data_size && !tried_avoid_double)
4231                                                 goto push_for_double;
4232                                         split = 2;
4233                                 }
4234                         }
4235                 }
4236         } else {
4237                 if (leaf_space_used(l, 0, mid) + data_size >
4238                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4239                         if (!extend && data_size && slot == 0) {
4240                                 split = 0;
4241                         } else if ((extend || !data_size) && slot == 0) {
4242                                 mid = 1;
4243                         } else {
4244                                 mid = slot;
4245                                 if (mid != nritems &&
4246                                     leaf_space_used(l, mid, nritems - mid) +
4247                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4248                                         if (data_size && !tried_avoid_double)
4249                                                 goto push_for_double;
4250                                         split = 2;
4251                                 }
4252                         }
4253                 }
4254         }
4255
4256         if (split == 0)
4257                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4258         else
4259                 btrfs_item_key(l, &disk_key, mid);
4260
4261         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4262                         &disk_key, 0, l->start, 0);
4263         if (IS_ERR(right))
4264                 return PTR_ERR(right);
4265
4266         root_add_used(root, fs_info->nodesize);
4267
4268         memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
4269         btrfs_set_header_bytenr(right, right->start);
4270         btrfs_set_header_generation(right, trans->transid);
4271         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4272         btrfs_set_header_owner(right, root->root_key.objectid);
4273         btrfs_set_header_level(right, 0);
4274         write_extent_buffer_fsid(right, fs_info->fsid);
4275         write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);
4276
4277         if (split == 0) {
4278                 if (mid <= slot) {
4279                         btrfs_set_header_nritems(right, 0);
4280                         insert_ptr(trans, fs_info, path, &disk_key,
4281                                    right->start, path->slots[1] + 1, 1);
4282                         btrfs_tree_unlock(path->nodes[0]);
4283                         free_extent_buffer(path->nodes[0]);
4284                         path->nodes[0] = right;
4285                         path->slots[0] = 0;
4286                         path->slots[1] += 1;
4287                 } else {
4288                         btrfs_set_header_nritems(right, 0);
4289                         insert_ptr(trans, fs_info, path, &disk_key,
4290                                    right->start, path->slots[1], 1);
4291                         btrfs_tree_unlock(path->nodes[0]);
4292                         free_extent_buffer(path->nodes[0]);
4293                         path->nodes[0] = right;
4294                         path->slots[0] = 0;
4295                         if (path->slots[1] == 0)
4296                                 fixup_low_keys(fs_info, path, &disk_key, 1);
4297                 }
4298                 /*
4299                  * We create a new leaf 'right' for the required ins_len and
4300                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4301                  * the content of ins_len to 'right'.
4302                  */
4303                 return ret;
4304         }
4305
4306         copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4307
4308         if (split == 2) {
4309                 BUG_ON(num_doubles != 0);
4310                 num_doubles++;
4311                 goto again;
4312         }
4313
4314         return 0;
4315
4316 push_for_double:
4317         push_for_double_split(trans, root, path, data_size);
4318         tried_avoid_double = 1;
4319         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4320                 return 0;
4321         goto again;
4322 }
4323
4324 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4325                                          struct btrfs_root *root,
4326                                          struct btrfs_path *path, int ins_len)
4327 {
4328         struct btrfs_fs_info *fs_info = root->fs_info;
4329         struct btrfs_key key;
4330         struct extent_buffer *leaf;
4331         struct btrfs_file_extent_item *fi;
4332         u64 extent_len = 0;
4333         u32 item_size;
4334         int ret;
4335
4336         leaf = path->nodes[0];
4337         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4338
4339         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4340                key.type != BTRFS_EXTENT_CSUM_KEY);
4341
4342         if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4343                 return 0;
4344
4345         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4346         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4347                 fi = btrfs_item_ptr(leaf, path->slots[0],
4348                                     struct btrfs_file_extent_item);
4349                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4350         }
4351         btrfs_release_path(path);
4352
4353         path->keep_locks = 1;
4354         path->search_for_split = 1;
4355         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4356         path->search_for_split = 0;
4357         if (ret > 0)
4358                 ret = -EAGAIN;
4359         if (ret < 0)
4360                 goto err;
4361
4362         ret = -EAGAIN;
4363         leaf = path->nodes[0];
4364         /* if our item isn't there, return now */
4365         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4366                 goto err;
4367
4368         /* the leaf has  changed, it now has room.  return now */
4369         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4370                 goto err;
4371
4372         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4373                 fi = btrfs_item_ptr(leaf, path->slots[0],
4374                                     struct btrfs_file_extent_item);
4375                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4376                         goto err;
4377         }
4378
4379         btrfs_set_path_blocking(path);
4380         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4381         if (ret)
4382                 goto err;
4383
4384         path->keep_locks = 0;
4385         btrfs_unlock_up_safe(path, 1);
4386         return 0;
4387 err:
4388         path->keep_locks = 0;
4389         return ret;
4390 }
4391
4392 static noinline int split_item(struct btrfs_fs_info *fs_info,
4393                                struct btrfs_path *path,
4394                                const struct btrfs_key *new_key,
4395                                unsigned long split_offset)
4396 {
4397         struct extent_buffer *leaf;
4398         struct btrfs_item *item;
4399         struct btrfs_item *new_item;
4400         int slot;
4401         char *buf;
4402         u32 nritems;
4403         u32 item_size;
4404         u32 orig_offset;
4405         struct btrfs_disk_key disk_key;
4406
4407         leaf = path->nodes[0];
4408         BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4409
4410         btrfs_set_path_blocking(path);
4411
4412         item = btrfs_item_nr(path->slots[0]);
4413         orig_offset = btrfs_item_offset(leaf, item);
4414         item_size = btrfs_item_size(leaf, item);
4415
4416         buf = kmalloc(item_size, GFP_NOFS);
4417         if (!buf)
4418                 return -ENOMEM;
4419
4420         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4421                             path->slots[0]), item_size);
4422
4423         slot = path->slots[0] + 1;
4424         nritems = btrfs_header_nritems(leaf);
4425         if (slot != nritems) {
4426                 /* shift the items */
4427                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4428                                 btrfs_item_nr_offset(slot),
4429                                 (nritems - slot) * sizeof(struct btrfs_item));
4430         }
4431
4432         btrfs_cpu_key_to_disk(&disk_key, new_key);
4433         btrfs_set_item_key(leaf, &disk_key, slot);
4434
4435         new_item = btrfs_item_nr(slot);
4436
4437         btrfs_set_item_offset(leaf, new_item, orig_offset);
4438         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4439
4440         btrfs_set_item_offset(leaf, item,
4441                               orig_offset + item_size - split_offset);
4442         btrfs_set_item_size(leaf, item, split_offset);
4443
4444         btrfs_set_header_nritems(leaf, nritems + 1);
4445
4446         /* write the data for the start of the original item */
4447         write_extent_buffer(leaf, buf,
4448                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4449                             split_offset);
4450
4451         /* write the data for the new item */
4452         write_extent_buffer(leaf, buf + split_offset,
4453                             btrfs_item_ptr_offset(leaf, slot),
4454                             item_size - split_offset);
4455         btrfs_mark_buffer_dirty(leaf);
4456
4457         BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4458         kfree(buf);
4459         return 0;
4460 }
4461
4462 /*
4463  * This function splits a single item into two items,
4464  * giving 'new_key' to the new item and splitting the
4465  * old one at split_offset (from the start of the item).
4466  *
4467  * The path may be released by this operation.  After
4468  * the split, the path is pointing to the old item.  The
4469  * new item is going to be in the same node as the old one.
4470  *
4471  * Note, the item being split must be smaller enough to live alone on
4472  * a tree block with room for one extra struct btrfs_item
4473  *
4474  * This allows us to split the item in place, keeping a lock on the
4475  * leaf the entire time.
4476  */
4477 int btrfs_split_item(struct btrfs_trans_handle *trans,
4478                      struct btrfs_root *root,
4479                      struct btrfs_path *path,
4480                      const struct btrfs_key *new_key,
4481                      unsigned long split_offset)
4482 {
4483         int ret;
4484         ret = setup_leaf_for_split(trans, root, path,
4485                                    sizeof(struct btrfs_item));
4486         if (ret)
4487                 return ret;
4488
4489         ret = split_item(root->fs_info, path, new_key, split_offset);
4490         return ret;
4491 }
4492
4493 /*
4494  * This function duplicate a item, giving 'new_key' to the new item.
4495  * It guarantees both items live in the same tree leaf and the new item
4496  * is contiguous with the original item.
4497  *
4498  * This allows us to split file extent in place, keeping a lock on the
4499  * leaf the entire time.
4500  */
4501 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4502                          struct btrfs_root *root,
4503                          struct btrfs_path *path,
4504                          const struct btrfs_key *new_key)
4505 {
4506         struct extent_buffer *leaf;
4507         int ret;
4508         u32 item_size;
4509
4510         leaf = path->nodes[0];
4511         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4512         ret = setup_leaf_for_split(trans, root, path,
4513                                    item_size + sizeof(struct btrfs_item));
4514         if (ret)
4515                 return ret;
4516
4517         path->slots[0]++;
4518         setup_items_for_insert(root, path, new_key, &item_size,
4519                                item_size, item_size +
4520                                sizeof(struct btrfs_item), 1);
4521         leaf = path->nodes[0];
4522         memcpy_extent_buffer(leaf,
4523                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4524                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4525                              item_size);
4526         return 0;
4527 }
4528
4529 /*
4530  * make the item pointed to by the path smaller.  new_size indicates
4531  * how small to make it, and from_end tells us if we just chop bytes
4532  * off the end of the item or if we shift the item to chop bytes off
4533  * the front.
4534  */
4535 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4536                          struct btrfs_path *path, u32 new_size, int from_end)
4537 {
4538         int slot;
4539         struct extent_buffer *leaf;
4540         struct btrfs_item *item;
4541         u32 nritems;
4542         unsigned int data_end;
4543         unsigned int old_data_start;
4544         unsigned int old_size;
4545         unsigned int size_diff;
4546         int i;
4547         struct btrfs_map_token token;
4548
4549         btrfs_init_map_token(&token);
4550
4551         leaf = path->nodes[0];
4552         slot = path->slots[0];
4553
4554         old_size = btrfs_item_size_nr(leaf, slot);
4555         if (old_size == new_size)
4556                 return;
4557
4558         nritems = btrfs_header_nritems(leaf);
4559         data_end = leaf_data_end(fs_info, leaf);
4560
4561         old_data_start = btrfs_item_offset_nr(leaf, slot);
4562
4563         size_diff = old_size - new_size;
4564
4565         BUG_ON(slot < 0);
4566         BUG_ON(slot >= nritems);
4567
4568         /*
4569          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4570          */
4571         /* first correct the data pointers */
4572         for (i = slot; i < nritems; i++) {
4573                 u32 ioff;
4574                 item = btrfs_item_nr(i);
4575
4576                 ioff = btrfs_token_item_offset(leaf, item, &token);
4577                 btrfs_set_token_item_offset(leaf, item,
4578                                             ioff + size_diff, &token);
4579         }
4580
4581         /* shift the data */
4582         if (from_end) {
4583                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4584                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4585                               data_end, old_data_start + new_size - data_end);
4586         } else {
4587                 struct btrfs_disk_key disk_key;
4588                 u64 offset;
4589
4590                 btrfs_item_key(leaf, &disk_key, slot);
4591
4592                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4593                         unsigned long ptr;
4594                         struct btrfs_file_extent_item *fi;
4595
4596                         fi = btrfs_item_ptr(leaf, slot,
4597                                             struct btrfs_file_extent_item);
4598                         fi = (struct btrfs_file_extent_item *)(
4599                              (unsigned long)fi - size_diff);
4600
4601                         if (btrfs_file_extent_type(leaf, fi) ==
4602                             BTRFS_FILE_EXTENT_INLINE) {
4603                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4604                                 memmove_extent_buffer(leaf, ptr,
4605                                       (unsigned long)fi,
4606                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4607                         }
4608                 }
4609
4610                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4611                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4612                               data_end, old_data_start - data_end);
4613
4614                 offset = btrfs_disk_key_offset(&disk_key);
4615                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4616                 btrfs_set_item_key(leaf, &disk_key, slot);
4617                 if (slot == 0)
4618                         fixup_low_keys(fs_info, path, &disk_key, 1);
4619         }
4620
4621         item = btrfs_item_nr(slot);
4622         btrfs_set_item_size(leaf, item, new_size);
4623         btrfs_mark_buffer_dirty(leaf);
4624
4625         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4626                 btrfs_print_leaf(leaf);
4627                 BUG();
4628         }
4629 }
4630
4631 /*
4632  * make the item pointed to by the path bigger, data_size is the added size.
4633  */
4634 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4635                        u32 data_size)
4636 {
4637         int slot;
4638         struct extent_buffer *leaf;
4639         struct btrfs_item *item;
4640         u32 nritems;
4641         unsigned int data_end;
4642         unsigned int old_data;
4643         unsigned int old_size;
4644         int i;
4645         struct btrfs_map_token token;
4646
4647         btrfs_init_map_token(&token);
4648
4649         leaf = path->nodes[0];
4650
4651         nritems = btrfs_header_nritems(leaf);
4652         data_end = leaf_data_end(fs_info, leaf);
4653
4654         if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4655                 btrfs_print_leaf(leaf);
4656                 BUG();
4657         }
4658         slot = path->slots[0];
4659         old_data = btrfs_item_end_nr(leaf, slot);
4660
4661         BUG_ON(slot < 0);
4662         if (slot >= nritems) {
4663                 btrfs_print_leaf(leaf);
4664                 btrfs_crit(fs_info, "slot %d too large, nritems %d",
4665                            slot, nritems);
4666                 BUG_ON(1);
4667         }
4668
4669         /*
4670          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4671          */
4672         /* first correct the data pointers */
4673         for (i = slot; i < nritems; i++) {
4674                 u32 ioff;
4675                 item = btrfs_item_nr(i);
4676
4677                 ioff = btrfs_token_item_offset(leaf, item, &token);
4678                 btrfs_set_token_item_offset(leaf, item,
4679                                             ioff - data_size, &token);
4680         }
4681
4682         /* shift the data */
4683         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4684                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4685                       data_end, old_data - data_end);
4686
4687         data_end = old_data;
4688         old_size = btrfs_item_size_nr(leaf, slot);
4689         item = btrfs_item_nr(slot);
4690         btrfs_set_item_size(leaf, item, old_size + data_size);
4691         btrfs_mark_buffer_dirty(leaf);
4692
4693         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4694                 btrfs_print_leaf(leaf);
4695                 BUG();
4696         }
4697 }
4698
4699 /*
4700  * this is a helper for btrfs_insert_empty_items, the main goal here is
4701  * to save stack depth by doing the bulk of the work in a function
4702  * that doesn't call btrfs_search_slot
4703  */
4704 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4705                             const struct btrfs_key *cpu_key, u32 *data_size,
4706                             u32 total_data, u32 total_size, int nr)
4707 {
4708         struct btrfs_fs_info *fs_info = root->fs_info;
4709         struct btrfs_item *item;
4710         int i;
4711         u32 nritems;
4712         unsigned int data_end;
4713         struct btrfs_disk_key disk_key;
4714         struct extent_buffer *leaf;
4715         int slot;
4716         struct btrfs_map_token token;
4717
4718         if (path->slots[0] == 0) {
4719                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4720                 fixup_low_keys(fs_info, path, &disk_key, 1);
4721         }
4722         btrfs_unlock_up_safe(path, 1);
4723
4724         btrfs_init_map_token(&token);
4725
4726         leaf = path->nodes[0];
4727         slot = path->slots[0];
4728
4729         nritems = btrfs_header_nritems(leaf);
4730         data_end = leaf_data_end(fs_info, leaf);
4731
4732         if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4733                 btrfs_print_leaf(leaf);
4734                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4735                            total_size, btrfs_leaf_free_space(fs_info, leaf));
4736                 BUG();
4737         }
4738
4739         if (slot != nritems) {
4740                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4741
4742                 if (old_data < data_end) {
4743                         btrfs_print_leaf(leaf);
4744                         btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4745                                    slot, old_data, data_end);
4746                         BUG_ON(1);
4747                 }
4748                 /*
4749                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4750                  */
4751                 /* first correct the data pointers */
4752                 for (i = slot; i < nritems; i++) {
4753                         u32 ioff;
4754
4755                         item = btrfs_item_nr(i);
4756                         ioff = btrfs_token_item_offset(leaf, item, &token);
4757                         btrfs_set_token_item_offset(leaf, item,
4758                                                     ioff - total_data, &token);
4759                 }
4760                 /* shift the items */
4761                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4762                               btrfs_item_nr_offset(slot),
4763                               (nritems - slot) * sizeof(struct btrfs_item));
4764
4765                 /* shift the data */
4766                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4767                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4768                               data_end, old_data - data_end);
4769                 data_end = old_data;
4770         }
4771
4772         /* setup the item for the new data */
4773         for (i = 0; i < nr; i++) {
4774                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4775                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4776                 item = btrfs_item_nr(slot + i);
4777                 btrfs_set_token_item_offset(leaf, item,
4778                                             data_end - data_size[i], &token);
4779                 data_end -= data_size[i];
4780                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4781         }
4782
4783         btrfs_set_header_nritems(leaf, nritems + nr);
4784         btrfs_mark_buffer_dirty(leaf);
4785
4786         if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4787                 btrfs_print_leaf(leaf);
4788                 BUG();
4789         }
4790 }
4791
4792 /*
4793  * Given a key and some data, insert items into the tree.
4794  * This does all the path init required, making room in the tree if needed.
4795  */
4796 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4797                             struct btrfs_root *root,
4798                             struct btrfs_path *path,
4799                             const struct btrfs_key *cpu_key, u32 *data_size,
4800                             int nr)
4801 {
4802         int ret = 0;
4803         int slot;
4804         int i;
4805         u32 total_size = 0;
4806         u32 total_data = 0;
4807
4808         for (i = 0; i < nr; i++)
4809                 total_data += data_size[i];
4810
4811         total_size = total_data + (nr * sizeof(struct btrfs_item));
4812         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4813         if (ret == 0)
4814                 return -EEXIST;
4815         if (ret < 0)
4816                 return ret;
4817
4818         slot = path->slots[0];
4819         BUG_ON(slot < 0);
4820
4821         setup_items_for_insert(root, path, cpu_key, data_size,
4822                                total_data, total_size, nr);
4823         return 0;
4824 }
4825
4826 /*
4827  * Given a key and some data, insert an item into the tree.
4828  * This does all the path init required, making room in the tree if needed.
4829  */
4830 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4831                       const struct btrfs_key *cpu_key, void *data,
4832                       u32 data_size)
4833 {
4834         int ret = 0;
4835         struct btrfs_path *path;
4836         struct extent_buffer *leaf;
4837         unsigned long ptr;
4838
4839         path = btrfs_alloc_path();
4840         if (!path)
4841                 return -ENOMEM;
4842         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4843         if (!ret) {
4844                 leaf = path->nodes[0];
4845                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4846                 write_extent_buffer(leaf, data, ptr, data_size);
4847                 btrfs_mark_buffer_dirty(leaf);
4848         }
4849         btrfs_free_path(path);
4850         return ret;
4851 }
4852
4853 /*
4854  * delete the pointer from a given node.
4855  *
4856  * the tree should have been previously balanced so the deletion does not
4857  * empty a node.
4858  */
4859 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4860                     int level, int slot)
4861 {
4862         struct btrfs_fs_info *fs_info = root->fs_info;
4863         struct extent_buffer *parent = path->nodes[level];
4864         u32 nritems;
4865         int ret;
4866
4867         nritems = btrfs_header_nritems(parent);
4868         if (slot != nritems - 1) {
4869                 if (level) {
4870                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4871                                         nritems - slot - 1);
4872                         BUG_ON(ret < 0);
4873                 }
4874                 memmove_extent_buffer(parent,
4875                               btrfs_node_key_ptr_offset(slot),
4876                               btrfs_node_key_ptr_offset(slot + 1),
4877                               sizeof(struct btrfs_key_ptr) *
4878                               (nritems - slot - 1));
4879         } else if (level) {
4880                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4881                                 GFP_NOFS);
4882                 BUG_ON(ret < 0);
4883         }
4884
4885         nritems--;
4886         btrfs_set_header_nritems(parent, nritems);
4887         if (nritems == 0 && parent == root->node) {
4888                 BUG_ON(btrfs_header_level(root->node) != 1);
4889                 /* just turn the root into a leaf and break */
4890                 btrfs_set_header_level(root->node, 0);
4891         } else if (slot == 0) {
4892                 struct btrfs_disk_key disk_key;
4893
4894                 btrfs_node_key(parent, &disk_key, 0);
4895                 fixup_low_keys(fs_info, path, &disk_key, level + 1);
4896         }
4897         btrfs_mark_buffer_dirty(parent);
4898 }
4899
4900 /*
4901  * a helper function to delete the leaf pointed to by path->slots[1] and
4902  * path->nodes[1].
4903  *
4904  * This deletes the pointer in path->nodes[1] and frees the leaf
4905  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4906  *
4907  * The path must have already been setup for deleting the leaf, including
4908  * all the proper balancing.  path->nodes[1] must be locked.
4909  */
4910 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4911                                     struct btrfs_root *root,
4912                                     struct btrfs_path *path,
4913                                     struct extent_buffer *leaf)
4914 {
4915         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4916         del_ptr(root, path, 1, path->slots[1]);
4917
4918         /*
4919          * btrfs_free_extent is expensive, we want to make sure we
4920          * aren't holding any locks when we call it
4921          */
4922         btrfs_unlock_up_safe(path, 0);
4923
4924         root_sub_used(root, leaf->len);
4925
4926         extent_buffer_get(leaf);
4927         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4928         free_extent_buffer_stale(leaf);
4929 }
4930 /*
4931  * delete the item at the leaf level in path.  If that empties
4932  * the leaf, remove it from the tree
4933  */
4934 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4935                     struct btrfs_path *path, int slot, int nr)
4936 {
4937         struct btrfs_fs_info *fs_info = root->fs_info;
4938         struct extent_buffer *leaf;
4939         struct btrfs_item *item;
4940         u32 last_off;
4941         u32 dsize = 0;
4942         int ret = 0;
4943         int wret;
4944         int i;
4945         u32 nritems;
4946         struct btrfs_map_token token;
4947
4948         btrfs_init_map_token(&token);
4949
4950         leaf = path->nodes[0];
4951         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4952
4953         for (i = 0; i < nr; i++)
4954                 dsize += btrfs_item_size_nr(leaf, slot + i);
4955
4956         nritems = btrfs_header_nritems(leaf);
4957
4958         if (slot + nr != nritems) {
4959                 int data_end = leaf_data_end(fs_info, leaf);
4960
4961                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4962                               data_end + dsize,
4963                               BTRFS_LEAF_DATA_OFFSET + data_end,
4964                               last_off - data_end);
4965
4966                 for (i = slot + nr; i < nritems; i++) {
4967                         u32 ioff;
4968
4969                         item = btrfs_item_nr(i);
4970                         ioff = btrfs_token_item_offset(leaf, item, &token);
4971                         btrfs_set_token_item_offset(leaf, item,
4972                                                     ioff + dsize, &token);
4973                 }
4974
4975                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4976                               btrfs_item_nr_offset(slot + nr),
4977                               sizeof(struct btrfs_item) *
4978                               (nritems - slot - nr));
4979         }
4980         btrfs_set_header_nritems(leaf, nritems - nr);
4981         nritems -= nr;
4982
4983         /* delete the leaf if we've emptied it */
4984         if (nritems == 0) {
4985                 if (leaf == root->node) {
4986                         btrfs_set_header_level(leaf, 0);
4987                 } else {
4988                         btrfs_set_path_blocking(path);
4989                         clean_tree_block(fs_info, leaf);
4990                         btrfs_del_leaf(trans, root, path, leaf);
4991                 }
4992         } else {
4993                 int used = leaf_space_used(leaf, 0, nritems);
4994                 if (slot == 0) {
4995                         struct btrfs_disk_key disk_key;
4996
4997                         btrfs_item_key(leaf, &disk_key, 0);
4998                         fixup_low_keys(fs_info, path, &disk_key, 1);
4999                 }
5000
5001                 /* delete the leaf if it is mostly empty */
5002                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5003                         /* push_leaf_left fixes the path.
5004                          * make sure the path still points to our leaf
5005                          * for possible call to del_ptr below
5006                          */
5007                         slot = path->slots[1];
5008                         extent_buffer_get(leaf);
5009
5010                         btrfs_set_path_blocking(path);
5011                         wret = push_leaf_left(trans, root, path, 1, 1,
5012                                               1, (u32)-1);
5013                         if (wret < 0 && wret != -ENOSPC)
5014                                 ret = wret;
5015
5016                         if (path->nodes[0] == leaf &&
5017                             btrfs_header_nritems(leaf)) {
5018                                 wret = push_leaf_right(trans, root, path, 1,
5019                                                        1, 1, 0);
5020                                 if (wret < 0 && wret != -ENOSPC)
5021                                         ret = wret;
5022                         }
5023
5024                         if (btrfs_header_nritems(leaf) == 0) {
5025                                 path->slots[1] = slot;
5026                                 btrfs_del_leaf(trans, root, path, leaf);
5027                                 free_extent_buffer(leaf);
5028                                 ret = 0;
5029                         } else {
5030                                 /* if we're still in the path, make sure
5031                                  * we're dirty.  Otherwise, one of the
5032                                  * push_leaf functions must have already
5033                                  * dirtied this buffer
5034                                  */
5035                                 if (path->nodes[0] == leaf)
5036                                         btrfs_mark_buffer_dirty(leaf);
5037                                 free_extent_buffer(leaf);
5038                         }
5039                 } else {
5040                         btrfs_mark_buffer_dirty(leaf);
5041                 }
5042         }
5043         return ret;
5044 }
5045
5046 /*
5047  * search the tree again to find a leaf with lesser keys
5048  * returns 0 if it found something or 1 if there are no lesser leaves.
5049  * returns < 0 on io errors.
5050  *
5051  * This may release the path, and so you may lose any locks held at the
5052  * time you call it.
5053  */
5054 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5055 {
5056         struct btrfs_key key;
5057         struct btrfs_disk_key found_key;
5058         int ret;
5059
5060         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5061
5062         if (key.offset > 0) {
5063                 key.offset--;
5064         } else if (key.type > 0) {
5065                 key.type--;
5066                 key.offset = (u64)-1;
5067         } else if (key.objectid > 0) {
5068                 key.objectid--;
5069                 key.type = (u8)-1;
5070                 key.offset = (u64)-1;
5071         } else {
5072                 return 1;
5073         }
5074
5075         btrfs_release_path(path);
5076         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5077         if (ret < 0)
5078                 return ret;
5079         btrfs_item_key(path->nodes[0], &found_key, 0);
5080         ret = comp_keys(&found_key, &key);
5081         /*
5082          * We might have had an item with the previous key in the tree right
5083          * before we released our path. And after we released our path, that
5084          * item might have been pushed to the first slot (0) of the leaf we
5085          * were holding due to a tree balance. Alternatively, an item with the
5086          * previous key can exist as the only element of a leaf (big fat item).
5087          * Therefore account for these 2 cases, so that our callers (like
5088          * btrfs_previous_item) don't miss an existing item with a key matching
5089          * the previous key we computed above.
5090          */
5091         if (ret <= 0)
5092                 return 0;
5093         return 1;
5094 }
5095
5096 /*
5097  * A helper function to walk down the tree starting at min_key, and looking
5098  * for nodes or leaves that are have a minimum transaction id.
5099  * This is used by the btree defrag code, and tree logging
5100  *
5101  * This does not cow, but it does stuff the starting key it finds back
5102  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5103  * key and get a writable path.
5104  *
5105  * This honors path->lowest_level to prevent descent past a given level
5106  * of the tree.
5107  *
5108  * min_trans indicates the oldest transaction that you are interested
5109  * in walking through.  Any nodes or leaves older than min_trans are
5110  * skipped over (without reading them).
5111  *
5112  * returns zero if something useful was found, < 0 on error and 1 if there
5113  * was nothing in the tree that matched the search criteria.
5114  */
5115 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5116                          struct btrfs_path *path,
5117                          u64 min_trans)
5118 {
5119         struct btrfs_fs_info *fs_info = root->fs_info;
5120         struct extent_buffer *cur;
5121         struct btrfs_key found_key;
5122         int slot;
5123         int sret;
5124         u32 nritems;
5125         int level;
5126         int ret = 1;
5127         int keep_locks = path->keep_locks;
5128
5129         path->keep_locks = 1;
5130 again:
5131         cur = btrfs_read_lock_root_node(root);
5132         level = btrfs_header_level(cur);
5133         WARN_ON(path->nodes[level]);
5134         path->nodes[level] = cur;
5135         path->locks[level] = BTRFS_READ_LOCK;
5136
5137         if (btrfs_header_generation(cur) < min_trans) {
5138                 ret = 1;
5139                 goto out;
5140         }
5141         while (1) {
5142                 nritems = btrfs_header_nritems(cur);
5143                 level = btrfs_header_level(cur);
5144                 sret = btrfs_bin_search(cur, min_key, level, &slot);
5145
5146                 /* at the lowest level, we're done, setup the path and exit */
5147                 if (level == path->lowest_level) {
5148                         if (slot >= nritems)
5149                                 goto find_next_key;
5150                         ret = 0;
5151                         path->slots[level] = slot;
5152                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5153                         goto out;
5154                 }
5155                 if (sret && slot > 0)
5156                         slot--;
5157                 /*
5158                  * check this node pointer against the min_trans parameters.
5159                  * If it is too old, old, skip to the next one.
5160                  */
5161                 while (slot < nritems) {
5162                         u64 gen;
5163
5164                         gen = btrfs_node_ptr_generation(cur, slot);
5165                         if (gen < min_trans) {
5166                                 slot++;
5167                                 continue;
5168                         }
5169                         break;
5170                 }
5171 find_next_key:
5172                 /*
5173                  * we didn't find a candidate key in this node, walk forward
5174                  * and find another one
5175                  */
5176                 if (slot >= nritems) {
5177                         path->slots[level] = slot;
5178                         btrfs_set_path_blocking(path);
5179                         sret = btrfs_find_next_key(root, path, min_key, level,
5180                                                   min_trans);
5181                         if (sret == 0) {
5182                                 btrfs_release_path(path);
5183                                 goto again;
5184                         } else {
5185                                 goto out;
5186                         }
5187                 }
5188                 /* save our key for returning back */
5189                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5190                 path->slots[level] = slot;
5191                 if (level == path->lowest_level) {
5192                         ret = 0;
5193                         goto out;
5194                 }
5195                 btrfs_set_path_blocking(path);
5196                 cur = read_node_slot(fs_info, cur, slot);
5197                 if (IS_ERR(cur)) {
5198                         ret = PTR_ERR(cur);
5199                         goto out;
5200                 }
5201
5202                 btrfs_tree_read_lock(cur);
5203
5204                 path->locks[level - 1] = BTRFS_READ_LOCK;
5205                 path->nodes[level - 1] = cur;
5206                 unlock_up(path, level, 1, 0, NULL);
5207                 btrfs_clear_path_blocking(path, NULL, 0);
5208         }
5209 out:
5210         path->keep_locks = keep_locks;
5211         if (ret == 0) {
5212                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5213                 btrfs_set_path_blocking(path);
5214                 memcpy(min_key, &found_key, sizeof(found_key));
5215         }
5216         return ret;
5217 }
5218
5219 static int tree_move_down(struct btrfs_fs_info *fs_info,
5220                            struct btrfs_path *path,
5221                            int *level)
5222 {
5223         struct extent_buffer *eb;
5224
5225         BUG_ON(*level == 0);
5226         eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5227         if (IS_ERR(eb))
5228                 return PTR_ERR(eb);
5229
5230         path->nodes[*level - 1] = eb;
5231         path->slots[*level - 1] = 0;
5232         (*level)--;
5233         return 0;
5234 }
5235
5236 static int tree_move_next_or_upnext(struct btrfs_path *path,
5237                                     int *level, int root_level)
5238 {
5239         int ret = 0;
5240         int nritems;
5241         nritems = btrfs_header_nritems(path->nodes[*level]);
5242
5243         path->slots[*level]++;
5244
5245         while (path->slots[*level] >= nritems) {
5246                 if (*level == root_level)
5247                         return -1;
5248
5249                 /* move upnext */
5250                 path->slots[*level] = 0;
5251                 free_extent_buffer(path->nodes[*level]);
5252                 path->nodes[*level] = NULL;
5253                 (*level)++;
5254                 path->slots[*level]++;
5255
5256                 nritems = btrfs_header_nritems(path->nodes[*level]);
5257                 ret = 1;
5258         }
5259         return ret;
5260 }
5261
5262 /*
5263  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5264  * or down.
5265  */
5266 static int tree_advance(struct btrfs_fs_info *fs_info,
5267                         struct btrfs_path *path,
5268                         int *level, int root_level,
5269                         int allow_down,
5270                         struct btrfs_key *key)
5271 {
5272         int ret;
5273
5274         if (*level == 0 || !allow_down) {
5275                 ret = tree_move_next_or_upnext(path, level, root_level);
5276         } else {
5277                 ret = tree_move_down(fs_info, path, level);
5278         }
5279         if (ret >= 0) {
5280                 if (*level == 0)
5281                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5282                                         path->slots[*level]);
5283                 else
5284                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5285                                         path->slots[*level]);
5286         }
5287         return ret;
5288 }
5289
5290 static int tree_compare_item(struct btrfs_path *left_path,
5291                              struct btrfs_path *right_path,
5292                              char *tmp_buf)
5293 {
5294         int cmp;
5295         int len1, len2;
5296         unsigned long off1, off2;
5297
5298         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5299         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5300         if (len1 != len2)
5301                 return 1;
5302
5303         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5304         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5305                                 right_path->slots[0]);
5306
5307         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5308
5309         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5310         if (cmp)
5311                 return 1;
5312         return 0;
5313 }
5314
/*
 * Values of the advance_left/advance_right requests in btrfs_compare_trees.
 * A request is passed to tree_advance() with
 * allow_down = (request != ADVANCE_ONLY_NEXT), so ADVANCE_ONLY_NEXT is the
 * value that keeps the tree from descending a level; 0 means no advance.
 */
5315 #define ADVANCE 1
5316 #define ADVANCE_ONLY_NEXT -1
5317
5318 /*
5319  * This function compares two trees and calls the provided callback for
5320  * every changed/new/deleted item it finds.
5321  * If shared tree blocks are encountered, whole subtrees are skipped, making
5322  * the compare pretty fast on snapshotted subvolumes.
5323  *
5324  * This currently works on commit roots only. As commit roots are read only,
5325  * we don't do any locking. The commit roots are protected with transactions.
5326  * Transactions are ended and rejoined when a commit is tried in between.
5327  *
5328  * This function checks for modifications done to the trees while comparing.
5329  * If it detects a change, it aborts immediately.
5330  */
5331 int btrfs_compare_trees(struct btrfs_root *left_root,
5332                         struct btrfs_root *right_root,
5333                         btrfs_changed_cb_t changed_cb, void *ctx)
5334 {
5335         struct btrfs_fs_info *fs_info = left_root->fs_info;
5336         int ret;
5337         int cmp;
5338         struct btrfs_path *left_path = NULL;
5339         struct btrfs_path *right_path = NULL;
5340         struct btrfs_key left_key;
5341         struct btrfs_key right_key;
5342         char *tmp_buf = NULL;
5343         int left_root_level;
5344         int right_root_level;
5345         int left_level;
5346         int right_level;
5347         int left_end_reached;
5348         int right_end_reached;
5349         int advance_left;
5350         int advance_right;
5351         u64 left_blockptr;
5352         u64 right_blockptr;
5353         u64 left_gen;
5354         u64 right_gen;
5355
5356         left_path = btrfs_alloc_path();
5357         if (!left_path) {
5358                 ret = -ENOMEM;
5359                 goto out;
5360         }
5361         right_path = btrfs_alloc_path();
5362         if (!right_path) {
5363                 ret = -ENOMEM;
5364                 goto out;
5365         }
5366
5367         tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5368         if (!tmp_buf) {
5369                 ret = -ENOMEM;
5370                 goto out;
5371         }
5372
5373         left_path->search_commit_root = 1;
5374         left_path->skip_locking = 1;
5375         right_path->search_commit_root = 1;
5376         right_path->skip_locking = 1;
5377
5378         /*
5379          * Strategy: Go to the first items of both trees. Then do
5380          *
5381          * If both trees are at level 0
5382          *   Compare keys of current items
5383          *     If left < right treat left item as new, advance left tree
5384          *       and repeat
5385          *     If left > right treat right item as deleted, advance right tree
5386          *       and repeat
5387          *     If left == right do deep compare of items, treat as changed if
5388          *       needed, advance both trees and repeat
5389          * If both trees are at the same level but not at level 0
5390          *   Compare keys of current nodes/leafs
5391          *     If left < right advance left tree and repeat
5392          *     If left > right advance right tree and repeat
5393          *     If left == right compare blockptrs of the next nodes/leafs
5394          *       If they match advance both trees but stay at the same level
5395          *         and repeat
5396          *       If they don't match advance both trees while allowing to go
5397          *         deeper and repeat
5398          * If tree levels are different
5399          *   Advance the tree that needs it and repeat
5400          *
5401          * Advancing a tree means:
5402          *   If we are at level 0, try to go to the next slot. If that's not
5403          *   possible, go one level up and repeat. Stop when we found a level
5404          *   where we could go to the next slot. We may at this point be on a
5405          *   node or a leaf.
5406          *
5407          *   If we are not at level 0 and not on shared tree blocks, go one
5408          *   level deeper.
5409          *
5410          *   If we are not at level 0 and on shared tree blocks, go one slot to
5411          *   the right if possible or go up and right.
5412          */
5413
5414         down_read(&fs_info->commit_root_sem);
5415         left_level = btrfs_header_level(left_root->commit_root);
5416         left_root_level = left_level;
5417         left_path->nodes[left_level] = left_root->commit_root;
5418         extent_buffer_get(left_path->nodes[left_level]);
5419
5420         right_level = btrfs_header_level(right_root->commit_root);
5421         right_root_level = right_level;
5422         right_path->nodes[right_level] = right_root->commit_root;
5423         extent_buffer_get(right_path->nodes[right_level]);
5424         up_read(&fs_info->commit_root_sem);
5425
5426         if (left_level == 0)
5427                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5428                                 &left_key, left_path->slots[left_level]);
5429         else
5430                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5431                                 &left_key, left_path->slots[left_level]);
5432         if (right_level == 0)
5433                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5434                                 &right_key, right_path->slots[right_level]);
5435         else
5436                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5437                                 &right_key, right_path->slots[right_level]);
5438
5439         left_end_reached = right_end_reached = 0;
5440         advance_left = advance_right = 0;
5441
5442         while (1) {
5443                 if (advance_left && !left_end_reached) {
5444                         ret = tree_advance(fs_info, left_path, &left_level,
5445                                         left_root_level,
5446                                         advance_left != ADVANCE_ONLY_NEXT,
5447                                         &left_key);
5448                         if (ret == -1)
5449                                 left_end_reached = ADVANCE;
5450                         else if (ret < 0)
5451                                 goto out;
5452                         advance_left = 0;
5453                 }
5454                 if (advance_right && !right_end_reached) {
5455                         ret = tree_advance(fs_info, right_path, &right_level,
5456                                         right_root_level,
5457                                         advance_right != ADVANCE_ONLY_NEXT,
5458                                         &right_key);
5459                         if (ret == -1)
5460                                 right_end_reached = ADVANCE;
5461                         else if (ret < 0)
5462                                 goto out;
5463                         advance_right = 0;
5464                 }
5465
5466                 if (left_end_reached && right_end_reached) {
5467                         ret = 0;
5468                         goto out;
5469                 } else if (left_end_reached) {
5470                         if (right_level == 0) {
5471                                 ret = changed_cb(left_path, right_path,
5472                                                 &right_key,
5473                                                 BTRFS_COMPARE_TREE_DELETED,
5474                                                 ctx);
5475                                 if (ret < 0)
5476                                         goto out;
5477                         }
5478                         advance_right = ADVANCE;
5479                         continue;
5480                 } else if (right_end_reached) {
5481                         if (left_level == 0) {
5482                                 ret = changed_cb(left_path, right_path,
5483                                                 &left_key,
5484                                                 BTRFS_COMPARE_TREE_NEW,
5485                                                 ctx);
5486                                 if (ret < 0)
5487                                         goto out;
5488                         }
5489                         advance_left = ADVANCE;
5490                         continue;
5491                 }
5492
5493                 if (left_level == 0 && right_level == 0) {
5494                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5495                         if (cmp < 0) {
5496                                 ret = changed_cb(left_path, right_path,
5497                                                 &left_key,
5498                                                 BTRFS_COMPARE_TREE_NEW,
5499                                                 ctx);
5500                                 if (ret < 0)
5501                                         goto out;
5502                                 advance_left = ADVANCE;
5503                         } else if (cmp > 0) {
5504                                 ret = changed_cb(left_path, right_path,
5505                                                 &right_key,
5506                                                 BTRFS_COMPARE_TREE_DELETED,
5507                                                 ctx);
5508                                 if (ret < 0)
5509                                         goto out;
5510                                 advance_right = ADVANCE;
5511                         } else {
5512                                 enum btrfs_compare_tree_result result;
5513
5514                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5515                                 ret = tree_compare_item(left_path, right_path,
5516                                                         tmp_buf);
5517                                 if (ret)
5518                                         result = BTRFS_COMPARE_TREE_CHANGED;
5519                                 else
5520                                         result = BTRFS_COMPARE_TREE_SAME;
5521                                 ret = changed_cb(left_path, right_path,
5522                                                  &left_key, result, ctx);
5523                                 if (ret < 0)
5524                                         goto out;
5525                                 advance_left = ADVANCE;
5526                                 advance_right = ADVANCE;
5527                         }
5528                 } else if (left_level == right_level) {
5529                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5530                         if (cmp < 0) {
5531                                 advance_left = ADVANCE;
5532                         } else if (cmp > 0) {
5533                                 advance_right = ADVANCE;
5534                         } else {
5535                                 left_blockptr = btrfs_node_blockptr(
5536                                                 left_path->nodes[left_level],
5537                                                 left_path->slots[left_level]);
5538                                 right_blockptr = btrfs_node_blockptr(
5539                                                 right_path->nodes[right_level],
5540                                                 right_path->slots[right_level]);
5541                                 left_gen = btrfs_node_ptr_generation(
5542                                                 left_path->nodes[left_level],
5543                                                 left_path->slots[left_level]);
5544                                 right_gen = btrfs_node_ptr_generation(
5545                                                 right_path->nodes[right_level],
5546                                                 right_path->slots[right_level]);
5547                                 if (left_blockptr == right_blockptr &&
5548                                     left_gen == right_gen) {
5549                                         /*
5550                                          * As we're on a shared block, don't
5551                                          * allow to go deeper.
5552                                          */
5553                                         advance_left = ADVANCE_ONLY_NEXT;
5554                                         advance_right = ADVANCE_ONLY_NEXT;
5555                                 } else {
5556                                         advance_left = ADVANCE;
5557                                         advance_right = ADVANCE;
5558                                 }
5559                         }
5560                 } else if (left_level < right_level) {
5561                         advance_right = ADVANCE;
5562                 } else {
5563                         advance_left = ADVANCE;
5564                 }
5565         }
5566
5567 out:
5568         btrfs_free_path(left_path);
5569         btrfs_free_path(right_path);
5570         kvfree(tmp_buf);
5571         return ret;
5572 }
5573
5574 /*
5575  * this is similar to btrfs_next_leaf, but does not try to preserve
5576  * and fixup the path.  It looks for and returns the next key in the
5577  * tree based on the current path and the min_trans parameters.
5578  *
5579  * 0 is returned if another key is found, < 0 if there are any errors
5580  * and 1 is returned if there are no higher keys in the tree
5581  *
5582  * path->keep_locks should be set to 1 on the search made before
5583  * calling this function.
5584  */
5585 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5586                         struct btrfs_key *key, int level, u64 min_trans)
5587 {
5588         int slot;
5589         struct extent_buffer *c;
5590
5591         WARN_ON(!path->keep_locks);
5592         while (level < BTRFS_MAX_LEVEL) {
5593                 if (!path->nodes[level])
5594                         return 1;
5595
5596                 slot = path->slots[level] + 1;
5597                 c = path->nodes[level];
5598 next:
5599                 if (slot >= btrfs_header_nritems(c)) {
5600                         int ret;
5601                         int orig_lowest;
5602                         struct btrfs_key cur_key;
5603                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5604                             !path->nodes[level + 1])
5605                                 return 1;
5606
5607                         if (path->locks[level + 1]) {
5608                                 level++;
5609                                 continue;
5610                         }
5611
5612                         slot = btrfs_header_nritems(c) - 1;
5613                         if (level == 0)
5614                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5615                         else
5616                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5617
5618                         orig_lowest = path->lowest_level;
5619                         btrfs_release_path(path);
5620                         path->lowest_level = level;
5621                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5622                                                 0, 0);
5623                         path->lowest_level = orig_lowest;
5624                         if (ret < 0)
5625                                 return ret;
5626
5627                         c = path->nodes[level];
5628                         slot = path->slots[level];
5629                         if (ret == 0)
5630                                 slot++;
5631                         goto next;
5632                 }
5633
5634                 if (level == 0)
5635                         btrfs_item_key_to_cpu(c, key, slot);
5636                 else {
5637                         u64 gen = btrfs_node_ptr_generation(c, slot);
5638
5639                         if (gen < min_trans) {
5640                                 slot++;
5641                                 goto next;
5642                         }
5643                         btrfs_node_key_to_cpu(c, key, slot);
5644                 }
5645                 return 0;
5646         }
5647         return 1;
5648 }
5649
5650 /*
5651  * search the tree again to find a leaf with greater keys
5652  * returns 0 if it found something or 1 if there are no greater leaves.
5653  * returns < 0 on io errors.
5654  */
5655 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5656 {
5657         return btrfs_next_old_leaf(root, path, 0);
5658 }
5659
5660 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5661                         u64 time_seq)
5662 {
5663         int slot;
5664         int level;
5665         struct extent_buffer *c;
5666         struct extent_buffer *next;
5667         struct btrfs_key key;
5668         u32 nritems;
5669         int ret;
5670         int old_spinning = path->leave_spinning;
5671         int next_rw_lock = 0;
5672
5673         nritems = btrfs_header_nritems(path->nodes[0]);
5674         if (nritems == 0)
5675                 return 1;
5676
5677         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5678 again:
5679         level = 1;
5680         next = NULL;
5681         next_rw_lock = 0;
5682         btrfs_release_path(path);
5683
5684         path->keep_locks = 1;
5685         path->leave_spinning = 1;
5686
5687         if (time_seq)
5688                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5689         else
5690                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5691         path->keep_locks = 0;
5692
5693         if (ret < 0)
5694                 return ret;
5695
5696         nritems = btrfs_header_nritems(path->nodes[0]);
5697         /*
5698          * by releasing the path above we dropped all our locks.  A balance
5699          * could have added more items next to the key that used to be
5700          * at the very end of the block.  So, check again here and
5701          * advance the path if there are now more items available.
5702          */
5703         if (nritems > 0 && path->slots[0] < nritems - 1) {
5704                 if (ret == 0)
5705                         path->slots[0]++;
5706                 ret = 0;
5707                 goto done;
5708         }
5709         /*
5710          * So the above check misses one case:
5711          * - after releasing the path above, someone has removed the item that
5712          *   used to be at the very end of the block, and balance between leafs
5713          *   gets another one with bigger key.offset to replace it.
5714          *
5715          * This one should be returned as well, or we can get leaf corruption
5716          * later(esp. in __btrfs_drop_extents()).
5717          *
5718          * And a bit more explanation about this check,
5719          * with ret > 0, the key isn't found, the path points to the slot
5720          * where it should be inserted, so the path->slots[0] item must be the
5721          * bigger one.
5722          */
5723         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5724                 ret = 0;
5725                 goto done;
5726         }
5727
5728         while (level < BTRFS_MAX_LEVEL) {
5729                 if (!path->nodes[level]) {
5730                         ret = 1;
5731                         goto done;
5732                 }
5733
5734                 slot = path->slots[level] + 1;
5735                 c = path->nodes[level];
5736                 if (slot >= btrfs_header_nritems(c)) {
5737                         level++;
5738                         if (level == BTRFS_MAX_LEVEL) {
5739                                 ret = 1;
5740                                 goto done;
5741                         }
5742                         continue;
5743                 }
5744
5745                 if (next) {
5746                         btrfs_tree_unlock_rw(next, next_rw_lock);
5747                         free_extent_buffer(next);
5748                 }
5749
5750                 next = c;
5751                 next_rw_lock = path->locks[level];
5752                 ret = read_block_for_search(root, path, &next, level,
5753                                             slot, &key);
5754                 if (ret == -EAGAIN)
5755                         goto again;
5756
5757                 if (ret < 0) {
5758                         btrfs_release_path(path);
5759                         goto done;
5760                 }
5761
5762                 if (!path->skip_locking) {
5763                         ret = btrfs_try_tree_read_lock(next);
5764                         if (!ret && time_seq) {
5765                                 /*
5766                                  * If we don't get the lock, we may be racing
5767                                  * with push_leaf_left, holding that lock while
5768                                  * itself waiting for the leaf we've currently
5769                                  * locked. To solve this situation, we give up
5770                                  * on our lock and cycle.
5771                                  */
5772                                 free_extent_buffer(next);
5773                                 btrfs_release_path(path);
5774                                 cond_resched();
5775                                 goto again;
5776                         }
5777                         if (!ret) {
5778                                 btrfs_set_path_blocking(path);
5779                                 btrfs_tree_read_lock(next);
5780                                 btrfs_clear_path_blocking(path, next,
5781                                                           BTRFS_READ_LOCK);
5782                         }
5783                         next_rw_lock = BTRFS_READ_LOCK;
5784                 }
5785                 break;
5786         }
5787         path->slots[level] = slot;
5788         while (1) {
5789                 level--;
5790                 c = path->nodes[level];
5791                 if (path->locks[level])
5792                         btrfs_tree_unlock_rw(c, path->locks[level]);
5793
5794                 free_extent_buffer(c);
5795                 path->nodes[level] = next;
5796                 path->slots[level] = 0;
5797                 if (!path->skip_locking)
5798                         path->locks[level] = next_rw_lock;
5799                 if (!level)
5800                         break;
5801
5802                 ret = read_block_for_search(root, path, &next, level,
5803                                             0, &key);
5804                 if (ret == -EAGAIN)
5805                         goto again;
5806
5807                 if (ret < 0) {
5808                         btrfs_release_path(path);
5809                         goto done;
5810                 }
5811
5812                 if (!path->skip_locking) {
5813                         ret = btrfs_try_tree_read_lock(next);
5814                         if (!ret) {
5815                                 btrfs_set_path_blocking(path);
5816                                 btrfs_tree_read_lock(next);
5817                                 btrfs_clear_path_blocking(path, next,
5818                                                           BTRFS_READ_LOCK);
5819                         }
5820                         next_rw_lock = BTRFS_READ_LOCK;
5821                 }
5822         }
5823         ret = 0;
5824 done:
5825         unlock_up(path, 0, 1, 0, NULL);
5826         path->leave_spinning = old_spinning;
5827         if (!old_spinning)
5828                 btrfs_set_path_blocking(path);
5829
5830         return ret;
5831 }
5832
5833 /*
5834  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5835  * searching until it gets past min_objectid or finds an item of 'type'
5836  *
5837  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5838  */
5839 int btrfs_previous_item(struct btrfs_root *root,
5840                         struct btrfs_path *path, u64 min_objectid,
5841                         int type)
5842 {
5843         struct btrfs_key found_key;
5844         struct extent_buffer *leaf;
5845         u32 nritems;
5846         int ret;
5847
5848         while (1) {
5849                 if (path->slots[0] == 0) {
5850                         btrfs_set_path_blocking(path);
5851                         ret = btrfs_prev_leaf(root, path);
5852                         if (ret != 0)
5853                                 return ret;
5854                 } else {
5855                         path->slots[0]--;
5856                 }
5857                 leaf = path->nodes[0];
5858                 nritems = btrfs_header_nritems(leaf);
5859                 if (nritems == 0)
5860                         return 1;
5861                 if (path->slots[0] == nritems)
5862                         path->slots[0]--;
5863
5864                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5865                 if (found_key.objectid < min_objectid)
5866                         break;
5867                 if (found_key.type == type)
5868                         return 0;
5869                 if (found_key.objectid == min_objectid &&
5870                     found_key.type < type)
5871                         break;
5872         }
5873         return 1;
5874 }
5875
5876 /*
5877  * search in extent tree to find a previous Metadata/Data extent item with
5878  * min objecitd.
5879  *
5880  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5881  */
5882 int btrfs_previous_extent_item(struct btrfs_root *root,
5883                         struct btrfs_path *path, u64 min_objectid)
5884 {
5885         struct btrfs_key found_key;
5886         struct extent_buffer *leaf;
5887         u32 nritems;
5888         int ret;
5889
5890         while (1) {
5891                 if (path->slots[0] == 0) {
5892                         btrfs_set_path_blocking(path);
5893                         ret = btrfs_prev_leaf(root, path);
5894                         if (ret != 0)
5895                                 return ret;
5896                 } else {
5897                         path->slots[0]--;
5898                 }
5899                 leaf = path->nodes[0];
5900                 nritems = btrfs_header_nritems(leaf);
5901                 if (nritems == 0)
5902                         return 1;
5903                 if (path->slots[0] == nritems)
5904                         path->slots[0]--;
5905
5906                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5907                 if (found_key.objectid < min_objectid)
5908                         break;
5909                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5910                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5911                         return 0;
5912                 if (found_key.objectid == min_objectid &&
5913                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5914                         break;
5915         }
5916         return 1;
5917 }