Btrfs: fix spinlock assertions on UP systems
[sfrench/cifs-2.6.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "locking.h"
25
26 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27                       *root, struct btrfs_path *path, int level);
28 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_key *ins_key,
30                       struct btrfs_path *path, int data_size, int extend);
31 static int push_node_left(struct btrfs_trans_handle *trans,
32                           struct btrfs_root *root, struct extent_buffer *dst,
33                           struct extent_buffer *src, int empty);
34 static int balance_node_right(struct btrfs_trans_handle *trans,
35                               struct btrfs_root *root,
36                               struct extent_buffer *dst_buf,
37                               struct extent_buffer *src_buf);
38 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39                    struct btrfs_path *path, int level, int slot);
40
41 struct btrfs_path *btrfs_alloc_path(void)
42 {
43         struct btrfs_path *path;
44         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
45         if (path)
46                 path->reada = 1;
47         return path;
48 }
49
50 /*
51  * set all locked nodes in the path to blocking locks.  This should
52  * be done before scheduling
53  */
54 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55 {
56         int i;
57         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58                 if (p->nodes[i] && p->locks[i])
59                         btrfs_set_lock_blocking(p->nodes[i]);
60         }
61 }
62
63 /*
64  * reset all the locked nodes in the patch to spinning locks.
65  *
66  * held is used to keep lockdep happy, when lockdep is enabled
67  * we set held to a blocking lock before we go around and
68  * retake all the spinlocks in the path.  You can safely use NULL
69  * for held
70  */
71 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72                                         struct extent_buffer *held)
73 {
74         int i;
75
76 #ifdef CONFIG_DEBUG_LOCK_ALLOC
77         /* lockdep really cares that we take all of these spinlocks
78          * in the right order.  If any of the locks in the path are not
79          * currently blocking, it is going to complain.  So, make really
80          * really sure by forcing the path to blocking before we clear
81          * the path blocking.
82          */
83         if (held)
84                 btrfs_set_lock_blocking(held);
85         btrfs_set_path_blocking(p);
86 #endif
87
88         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89                 if (p->nodes[i] && p->locks[i])
90                         btrfs_clear_lock_blocking(p->nodes[i]);
91         }
92
93 #ifdef CONFIG_DEBUG_LOCK_ALLOC
94         if (held)
95                 btrfs_clear_lock_blocking(held);
96 #endif
97 }
98
99 /* this also releases the path */
100 void btrfs_free_path(struct btrfs_path *p)
101 {
102         btrfs_release_path(NULL, p);
103         kmem_cache_free(btrfs_path_cachep, p);
104 }
105
106 /*
107  * path release drops references on the extent buffers in the path
108  * and it drops any locks held by this path
109  *
110  * It is safe to call this on paths that no locks or extent buffers held.
111  */
112 noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
113 {
114         int i;
115
116         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
117                 p->slots[i] = 0;
118                 if (!p->nodes[i])
119                         continue;
120                 if (p->locks[i]) {
121                         btrfs_tree_unlock(p->nodes[i]);
122                         p->locks[i] = 0;
123                 }
124                 free_extent_buffer(p->nodes[i]);
125                 p->nodes[i] = NULL;
126         }
127 }
128
129 /*
130  * safely gets a reference on the root node of a tree.  A lock
131  * is not taken, so a concurrent writer may put a different node
132  * at the root of the tree.  See btrfs_lock_root_node for the
133  * looping required.
134  *
135  * The extent buffer returned by this has a reference taken, so
136  * it won't disappear.  It may stop being the root of the tree
137  * at any time because there are no locks held.
138  */
139 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
140 {
141         struct extent_buffer *eb;
142         spin_lock(&root->node_lock);
143         eb = root->node;
144         extent_buffer_get(eb);
145         spin_unlock(&root->node_lock);
146         return eb;
147 }
148
149 /* loop around taking references on and locking the root node of the
150  * tree until you end up with a lock on the root.  A locked buffer
151  * is returned, with a reference held.
152  */
153 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
154 {
155         struct extent_buffer *eb;
156
157         while (1) {
158                 eb = btrfs_root_node(root);
159                 btrfs_tree_lock(eb);
160
161                 spin_lock(&root->node_lock);
162                 if (eb == root->node) {
163                         spin_unlock(&root->node_lock);
164                         break;
165                 }
166                 spin_unlock(&root->node_lock);
167
168                 btrfs_tree_unlock(eb);
169                 free_extent_buffer(eb);
170         }
171         return eb;
172 }
173
174 /* cowonly root (everything not a reference counted cow subvolume), just get
175  * put onto a simple dirty list.  transaction.c walks this to make sure they
176  * get properly updated on disk.
177  */
178 static void add_root_to_dirty_list(struct btrfs_root *root)
179 {
180         if (root->track_dirty && list_empty(&root->dirty_list)) {
181                 list_add(&root->dirty_list,
182                          &root->fs_info->dirty_cowonly_roots);
183         }
184 }
185
186 /*
187  * used by snapshot creation to make a copy of a root for a tree with
188  * a given objectid.  The buffer with the new root node is returned in
189  * cow_ret, and this func returns zero on success or a negative error code.
190  */
191 int btrfs_copy_root(struct btrfs_trans_handle *trans,
192                       struct btrfs_root *root,
193                       struct extent_buffer *buf,
194                       struct extent_buffer **cow_ret, u64 new_root_objectid)
195 {
196         struct extent_buffer *cow;
197         u32 nritems;
198         int ret = 0;
199         int level;
200         struct btrfs_root *new_root;
201
202         new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
203         if (!new_root)
204                 return -ENOMEM;
205
206         memcpy(new_root, root, sizeof(*new_root));
207         new_root->root_key.objectid = new_root_objectid;
208
209         WARN_ON(root->ref_cows && trans->transid !=
210                 root->fs_info->running_transaction->transid);
211         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
212
213         level = btrfs_header_level(buf);
214         nritems = btrfs_header_nritems(buf);
215
216         cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
217                                      new_root_objectid, trans->transid,
218                                      level, buf->start, 0);
219         if (IS_ERR(cow)) {
220                 kfree(new_root);
221                 return PTR_ERR(cow);
222         }
223
224         copy_extent_buffer(cow, buf, 0, 0, cow->len);
225         btrfs_set_header_bytenr(cow, cow->start);
226         btrfs_set_header_generation(cow, trans->transid);
227         btrfs_set_header_owner(cow, new_root_objectid);
228         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
229
230         write_extent_buffer(cow, root->fs_info->fsid,
231                             (unsigned long)btrfs_header_fsid(cow),
232                             BTRFS_FSID_SIZE);
233
234         WARN_ON(btrfs_header_generation(buf) > trans->transid);
235         ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
236         kfree(new_root);
237
238         if (ret)
239                 return ret;
240
241         btrfs_mark_buffer_dirty(cow);
242         *cow_ret = cow;
243         return 0;
244 }
245
246 /*
247  * does the dirty work in cow of a single block.  The parent block (if
248  * supplied) is updated to point to the new cow copy.  The new buffer is marked
249  * dirty and returned locked.  If you modify the block it needs to be marked
250  * dirty again.
251  *
252  * search_start -- an allocation hint for the new block
253  *
254  * empty_size -- a hint that you plan on doing more cow.  This is the size in
255  * bytes the allocator should try to find free next to the block it returns.
256  * This is just a hint and may be ignored by the allocator.
257  *
258  * prealloc_dest -- if you have already reserved a destination for the cow,
259  * this uses that block instead of allocating a new one.
260  * btrfs_alloc_reserved_extent is used to finish the allocation.
261  */
262 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263                              struct btrfs_root *root,
264                              struct extent_buffer *buf,
265                              struct extent_buffer *parent, int parent_slot,
266                              struct extent_buffer **cow_ret,
267                              u64 search_start, u64 empty_size,
268                              u64 prealloc_dest)
269 {
270         u64 parent_start;
271         struct extent_buffer *cow;
272         u32 nritems;
273         int ret = 0;
274         int level;
275         int unlock_orig = 0;
276
277         if (*cow_ret == buf)
278                 unlock_orig = 1;
279
280         btrfs_assert_tree_locked(buf);
281
282         if (parent)
283                 parent_start = parent->start;
284         else
285                 parent_start = 0;
286
287         WARN_ON(root->ref_cows && trans->transid !=
288                 root->fs_info->running_transaction->transid);
289         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
290
291         level = btrfs_header_level(buf);
292         nritems = btrfs_header_nritems(buf);
293
294         if (prealloc_dest) {
295                 struct btrfs_key ins;
296
297                 ins.objectid = prealloc_dest;
298                 ins.offset = buf->len;
299                 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301                 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302                                                   root->root_key.objectid,
303                                                   trans->transid, level, &ins);
304                 BUG_ON(ret);
305                 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306                                             buf->len, level);
307         } else {
308                 cow = btrfs_alloc_free_block(trans, root, buf->len,
309                                              parent_start,
310                                              root->root_key.objectid,
311                                              trans->transid, level,
312                                              search_start, empty_size);
313         }
314         if (IS_ERR(cow))
315                 return PTR_ERR(cow);
316
317         /* cow is set to blocking by btrfs_init_new_buffer */
318
319         copy_extent_buffer(cow, buf, 0, 0, cow->len);
320         btrfs_set_header_bytenr(cow, cow->start);
321         btrfs_set_header_generation(cow, trans->transid);
322         btrfs_set_header_owner(cow, root->root_key.objectid);
323         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
324
325         write_extent_buffer(cow, root->fs_info->fsid,
326                             (unsigned long)btrfs_header_fsid(cow),
327                             BTRFS_FSID_SIZE);
328
329         WARN_ON(btrfs_header_generation(buf) > trans->transid);
330         if (btrfs_header_generation(buf) != trans->transid) {
331                 u32 nr_extents;
332                 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
333                 if (ret)
334                         return ret;
335
336                 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
337                 WARN_ON(ret);
338         } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
339                 /*
340                  * There are only two places that can drop reference to
341                  * tree blocks owned by living reloc trees, one is here,
342                  * the other place is btrfs_drop_subtree. In both places,
343                  * we check reference count while tree block is locked.
344                  * Furthermore, if reference count is one, it won't get
345                  * increased by someone else.
346                  */
347                 u32 refs;
348                 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
349                                               buf->len, &refs);
350                 BUG_ON(ret);
351                 if (refs == 1) {
352                         ret = btrfs_update_ref(trans, root, buf, cow,
353                                                0, nritems);
354                         clean_tree_block(trans, root, buf);
355                 } else {
356                         ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
357                 }
358                 BUG_ON(ret);
359         } else {
360                 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
361                 if (ret)
362                         return ret;
363                 clean_tree_block(trans, root, buf);
364         }
365
366         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
367                 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
368                 WARN_ON(ret);
369         }
370
371         if (buf == root->node) {
372                 WARN_ON(parent && parent != buf);
373
374                 spin_lock(&root->node_lock);
375                 root->node = cow;
376                 extent_buffer_get(cow);
377                 spin_unlock(&root->node_lock);
378
379                 if (buf != root->commit_root) {
380                         btrfs_free_extent(trans, root, buf->start,
381                                           buf->len, buf->start,
382                                           root->root_key.objectid,
383                                           btrfs_header_generation(buf),
384                                           level, 1);
385                 }
386                 free_extent_buffer(buf);
387                 add_root_to_dirty_list(root);
388         } else {
389                 btrfs_set_node_blockptr(parent, parent_slot,
390                                         cow->start);
391                 WARN_ON(trans->transid == 0);
392                 btrfs_set_node_ptr_generation(parent, parent_slot,
393                                               trans->transid);
394                 btrfs_mark_buffer_dirty(parent);
395                 WARN_ON(btrfs_header_generation(parent) != trans->transid);
396                 btrfs_free_extent(trans, root, buf->start, buf->len,
397                                   parent_start, btrfs_header_owner(parent),
398                                   btrfs_header_generation(parent), level, 1);
399         }
400         if (unlock_orig)
401                 btrfs_tree_unlock(buf);
402         free_extent_buffer(buf);
403         btrfs_mark_buffer_dirty(cow);
404         *cow_ret = cow;
405         return 0;
406 }
407
408 /*
409  * cows a single block, see __btrfs_cow_block for the real work.
410  * This version of it has extra checks so that a block isn't cow'd more than
411  * once per transaction, as long as it hasn't been written yet
412  */
413 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414                     struct btrfs_root *root, struct extent_buffer *buf,
415                     struct extent_buffer *parent, int parent_slot,
416                     struct extent_buffer **cow_ret, u64 prealloc_dest)
417 {
418         u64 search_start;
419         int ret;
420
421         if (trans->transaction != root->fs_info->running_transaction) {
422                 printk(KERN_CRIT "trans %llu running %llu\n",
423                        (unsigned long long)trans->transid,
424                        (unsigned long long)
425                        root->fs_info->running_transaction->transid);
426                 WARN_ON(1);
427         }
428         if (trans->transid != root->fs_info->generation) {
429                 printk(KERN_CRIT "trans %llu running %llu\n",
430                        (unsigned long long)trans->transid,
431                        (unsigned long long)root->fs_info->generation);
432                 WARN_ON(1);
433         }
434
435         if (btrfs_header_generation(buf) == trans->transid &&
436             btrfs_header_owner(buf) == root->root_key.objectid &&
437             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438                 *cow_ret = buf;
439                 WARN_ON(prealloc_dest);
440                 return 0;
441         }
442
443         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444
445         if (parent)
446                 btrfs_set_lock_blocking(parent);
447         btrfs_set_lock_blocking(buf);
448
449         ret = __btrfs_cow_block(trans, root, buf, parent,
450                                  parent_slot, cow_ret, search_start, 0,
451                                  prealloc_dest);
452         return ret;
453 }
454
455 /*
456  * helper function for defrag to decide if two blocks pointed to by a
457  * node are actually close by
458  */
459 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
460 {
461         if (blocknr < other && other - (blocknr + blocksize) < 32768)
462                 return 1;
463         if (blocknr > other && blocknr - (other + blocksize) < 32768)
464                 return 1;
465         return 0;
466 }
467
468 /*
469  * compare two keys in a memcmp fashion
470  */
471 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
472 {
473         struct btrfs_key k1;
474
475         btrfs_disk_key_to_cpu(&k1, disk);
476
477         if (k1.objectid > k2->objectid)
478                 return 1;
479         if (k1.objectid < k2->objectid)
480                 return -1;
481         if (k1.type > k2->type)
482                 return 1;
483         if (k1.type < k2->type)
484                 return -1;
485         if (k1.offset > k2->offset)
486                 return 1;
487         if (k1.offset < k2->offset)
488                 return -1;
489         return 0;
490 }
491
492 /*
493  * same as comp_keys only with two btrfs_key's
494  */
495 static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
496 {
497         if (k1->objectid > k2->objectid)
498                 return 1;
499         if (k1->objectid < k2->objectid)
500                 return -1;
501         if (k1->type > k2->type)
502                 return 1;
503         if (k1->type < k2->type)
504                 return -1;
505         if (k1->offset > k2->offset)
506                 return 1;
507         if (k1->offset < k2->offset)
508                 return -1;
509         return 0;
510 }
511
512 /*
513  * this is used by the defrag code to go through all the
514  * leaves pointed to by a node and reallocate them so that
515  * disk order is close to key order
516  */
517 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
518                        struct btrfs_root *root, struct extent_buffer *parent,
519                        int start_slot, int cache_only, u64 *last_ret,
520                        struct btrfs_key *progress)
521 {
522         struct extent_buffer *cur;
523         u64 blocknr;
524         u64 gen;
525         u64 search_start = *last_ret;
526         u64 last_block = 0;
527         u64 other;
528         u32 parent_nritems;
529         int end_slot;
530         int i;
531         int err = 0;
532         int parent_level;
533         int uptodate;
534         u32 blocksize;
535         int progress_passed = 0;
536         struct btrfs_disk_key disk_key;
537
538         parent_level = btrfs_header_level(parent);
539         if (cache_only && parent_level != 1)
540                 return 0;
541
542         if (trans->transaction != root->fs_info->running_transaction)
543                 WARN_ON(1);
544         if (trans->transid != root->fs_info->generation)
545                 WARN_ON(1);
546
547         parent_nritems = btrfs_header_nritems(parent);
548         blocksize = btrfs_level_size(root, parent_level - 1);
549         end_slot = parent_nritems;
550
551         if (parent_nritems == 1)
552                 return 0;
553
554         btrfs_set_lock_blocking(parent);
555
556         for (i = start_slot; i < end_slot; i++) {
557                 int close = 1;
558
559                 if (!parent->map_token) {
560                         map_extent_buffer(parent,
561                                         btrfs_node_key_ptr_offset(i),
562                                         sizeof(struct btrfs_key_ptr),
563                                         &parent->map_token, &parent->kaddr,
564                                         &parent->map_start, &parent->map_len,
565                                         KM_USER1);
566                 }
567                 btrfs_node_key(parent, &disk_key, i);
568                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
569                         continue;
570
571                 progress_passed = 1;
572                 blocknr = btrfs_node_blockptr(parent, i);
573                 gen = btrfs_node_ptr_generation(parent, i);
574                 if (last_block == 0)
575                         last_block = blocknr;
576
577                 if (i > 0) {
578                         other = btrfs_node_blockptr(parent, i - 1);
579                         close = close_blocks(blocknr, other, blocksize);
580                 }
581                 if (!close && i < end_slot - 2) {
582                         other = btrfs_node_blockptr(parent, i + 1);
583                         close = close_blocks(blocknr, other, blocksize);
584                 }
585                 if (close) {
586                         last_block = blocknr;
587                         continue;
588                 }
589                 if (parent->map_token) {
590                         unmap_extent_buffer(parent, parent->map_token,
591                                             KM_USER1);
592                         parent->map_token = NULL;
593                 }
594
595                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
596                 if (cur)
597                         uptodate = btrfs_buffer_uptodate(cur, gen);
598                 else
599                         uptodate = 0;
600                 if (!cur || !uptodate) {
601                         if (cache_only) {
602                                 free_extent_buffer(cur);
603                                 continue;
604                         }
605                         if (!cur) {
606                                 cur = read_tree_block(root, blocknr,
607                                                          blocksize, gen);
608                         } else if (!uptodate) {
609                                 btrfs_read_buffer(cur, gen);
610                         }
611                 }
612                 if (search_start == 0)
613                         search_start = last_block;
614
615                 btrfs_tree_lock(cur);
616                 btrfs_set_lock_blocking(cur);
617                 err = __btrfs_cow_block(trans, root, cur, parent, i,
618                                         &cur, search_start,
619                                         min(16 * blocksize,
620                                             (end_slot - i) * blocksize), 0);
621                 if (err) {
622                         btrfs_tree_unlock(cur);
623                         free_extent_buffer(cur);
624                         break;
625                 }
626                 search_start = cur->start;
627                 last_block = cur->start;
628                 *last_ret = search_start;
629                 btrfs_tree_unlock(cur);
630                 free_extent_buffer(cur);
631         }
632         if (parent->map_token) {
633                 unmap_extent_buffer(parent, parent->map_token,
634                                     KM_USER1);
635                 parent->map_token = NULL;
636         }
637         return err;
638 }
639
640 /*
641  * The leaf data grows from end-to-front in the node.
642  * this returns the address of the start of the last item,
643  * which is the stop of the leaf data stack
644  */
645 static inline unsigned int leaf_data_end(struct btrfs_root *root,
646                                          struct extent_buffer *leaf)
647 {
648         u32 nr = btrfs_header_nritems(leaf);
649         if (nr == 0)
650                 return BTRFS_LEAF_DATA_SIZE(root);
651         return btrfs_item_offset_nr(leaf, nr - 1);
652 }
653
654 /*
655  * extra debugging checks to make sure all the items in a key are
656  * well formed and in the proper order
657  */
658 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
659                       int level)
660 {
661         struct extent_buffer *parent = NULL;
662         struct extent_buffer *node = path->nodes[level];
663         struct btrfs_disk_key parent_key;
664         struct btrfs_disk_key node_key;
665         int parent_slot;
666         int slot;
667         struct btrfs_key cpukey;
668         u32 nritems = btrfs_header_nritems(node);
669
670         if (path->nodes[level + 1])
671                 parent = path->nodes[level + 1];
672
673         slot = path->slots[level];
674         BUG_ON(nritems == 0);
675         if (parent) {
676                 parent_slot = path->slots[level + 1];
677                 btrfs_node_key(parent, &parent_key, parent_slot);
678                 btrfs_node_key(node, &node_key, 0);
679                 BUG_ON(memcmp(&parent_key, &node_key,
680                               sizeof(struct btrfs_disk_key)));
681                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682                        btrfs_header_bytenr(node));
683         }
684         BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
685         if (slot != 0) {
686                 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
687                 btrfs_node_key(node, &node_key, slot);
688                 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
689         }
690         if (slot < nritems - 1) {
691                 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
692                 btrfs_node_key(node, &node_key, slot);
693                 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
694         }
695         return 0;
696 }
697
698 /*
699  * extra checking to make sure all the items in a leaf are
700  * well formed and in the proper order
701  */
702 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
703                       int level)
704 {
705         struct extent_buffer *leaf = path->nodes[level];
706         struct extent_buffer *parent = NULL;
707         int parent_slot;
708         struct btrfs_key cpukey;
709         struct btrfs_disk_key parent_key;
710         struct btrfs_disk_key leaf_key;
711         int slot = path->slots[0];
712
713         u32 nritems = btrfs_header_nritems(leaf);
714
715         if (path->nodes[level + 1])
716                 parent = path->nodes[level + 1];
717
718         if (nritems == 0)
719                 return 0;
720
721         if (parent) {
722                 parent_slot = path->slots[level + 1];
723                 btrfs_node_key(parent, &parent_key, parent_slot);
724                 btrfs_item_key(leaf, &leaf_key, 0);
725
726                 BUG_ON(memcmp(&parent_key, &leaf_key,
727                        sizeof(struct btrfs_disk_key)));
728                 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
729                        btrfs_header_bytenr(leaf));
730         }
731         if (slot != 0 && slot < nritems - 1) {
732                 btrfs_item_key(leaf, &leaf_key, slot);
733                 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
734                 if (comp_keys(&leaf_key, &cpukey) <= 0) {
735                         btrfs_print_leaf(root, leaf);
736                         printk(KERN_CRIT "slot %d offset bad key\n", slot);
737                         BUG_ON(1);
738                 }
739                 if (btrfs_item_offset_nr(leaf, slot - 1) !=
740                        btrfs_item_end_nr(leaf, slot)) {
741                         btrfs_print_leaf(root, leaf);
742                         printk(KERN_CRIT "slot %d offset bad\n", slot);
743                         BUG_ON(1);
744                 }
745         }
746         if (slot < nritems - 1) {
747                 btrfs_item_key(leaf, &leaf_key, slot);
748                 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
749                 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
750                 if (btrfs_item_offset_nr(leaf, slot) !=
751                         btrfs_item_end_nr(leaf, slot + 1)) {
752                         btrfs_print_leaf(root, leaf);
753                         printk(KERN_CRIT "slot %d offset bad\n", slot);
754                         BUG_ON(1);
755                 }
756         }
757         BUG_ON(btrfs_item_offset_nr(leaf, 0) +
758                btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
759         return 0;
760 }
761
762 static noinline int check_block(struct btrfs_root *root,
763                                 struct btrfs_path *path, int level)
764 {
765         return 0;
766         if (level == 0)
767                 return check_leaf(root, path, level);
768         return check_node(root, path, level);
769 }
770
771 /*
772  * search for key in the extent_buffer.  The items start at offset p,
773  * and they are item_size apart.  There are 'max' items in p.
774  *
775  * the slot in the array is returned via slot, and it points to
776  * the place where you would insert key if it is not found in
777  * the array.
778  *
779  * slot may point to max if the key is bigger than all of the keys
780  */
781 static noinline int generic_bin_search(struct extent_buffer *eb,
782                                        unsigned long p,
783                                        int item_size, struct btrfs_key *key,
784                                        int max, int *slot)
785 {
786         int low = 0;
787         int high = max;
788         int mid;
789         int ret;
790         struct btrfs_disk_key *tmp = NULL;
791         struct btrfs_disk_key unaligned;
792         unsigned long offset;
793         char *map_token = NULL;
794         char *kaddr = NULL;
795         unsigned long map_start = 0;
796         unsigned long map_len = 0;
797         int err;
798
799         while (low < high) {
800                 mid = (low + high) / 2;
801                 offset = p + mid * item_size;
802
803                 if (!map_token || offset < map_start ||
804                     (offset + sizeof(struct btrfs_disk_key)) >
805                     map_start + map_len) {
806                         if (map_token) {
807                                 unmap_extent_buffer(eb, map_token, KM_USER0);
808                                 map_token = NULL;
809                         }
810
811                         err = map_private_extent_buffer(eb, offset,
812                                                 sizeof(struct btrfs_disk_key),
813                                                 &map_token, &kaddr,
814                                                 &map_start, &map_len, KM_USER0);
815
816                         if (!err) {
817                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
818                                                         map_start);
819                         } else {
820                                 read_extent_buffer(eb, &unaligned,
821                                                    offset, sizeof(unaligned));
822                                 tmp = &unaligned;
823                         }
824
825                 } else {
826                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
827                                                         map_start);
828                 }
829                 ret = comp_keys(tmp, key);
830
831                 if (ret < 0)
832                         low = mid + 1;
833                 else if (ret > 0)
834                         high = mid;
835                 else {
836                         *slot = mid;
837                         if (map_token)
838                                 unmap_extent_buffer(eb, map_token, KM_USER0);
839                         return 0;
840                 }
841         }
842         *slot = low;
843         if (map_token)
844                 unmap_extent_buffer(eb, map_token, KM_USER0);
845         return 1;
846 }
847
848 /*
849  * simple bin_search frontend that does the right thing for
850  * leaves vs nodes
851  */
852 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
853                       int level, int *slot)
854 {
855         if (level == 0) {
856                 return generic_bin_search(eb,
857                                           offsetof(struct btrfs_leaf, items),
858                                           sizeof(struct btrfs_item),
859                                           key, btrfs_header_nritems(eb),
860                                           slot);
861         } else {
862                 return generic_bin_search(eb,
863                                           offsetof(struct btrfs_node, ptrs),
864                                           sizeof(struct btrfs_key_ptr),
865                                           key, btrfs_header_nritems(eb),
866                                           slot);
867         }
868         return -1;
869 }
870
871 /* given a node and slot number, this reads the blocks it points to.  The
872  * extent buffer is returned with a reference taken (but unlocked).
873  * NULL is returned on error.
874  */
875 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
876                                    struct extent_buffer *parent, int slot)
877 {
878         int level = btrfs_header_level(parent);
879         if (slot < 0)
880                 return NULL;
881         if (slot >= btrfs_header_nritems(parent))
882                 return NULL;
883
884         BUG_ON(level == 0);
885
886         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
887                        btrfs_level_size(root, level - 1),
888                        btrfs_node_ptr_generation(parent, slot));
889 }
890
891 /*
892  * node level balancing, used to make sure nodes are in proper order for
893  * item deletion.  We balance from the top down, so we have to make sure
894  * that a deletion won't leave an node completely empty later on.
895  */
896 static noinline int balance_level(struct btrfs_trans_handle *trans,
897                          struct btrfs_root *root,
898                          struct btrfs_path *path, int level)
899 {
900         struct extent_buffer *right = NULL;
901         struct extent_buffer *mid;
902         struct extent_buffer *left = NULL;
903         struct extent_buffer *parent = NULL;
904         int ret = 0;
905         int wret;
906         int pslot;
907         int orig_slot = path->slots[level];
908         int err_on_enospc = 0;
909         u64 orig_ptr;
910
911         if (level == 0)
912                 return 0;
913
914         mid = path->nodes[level];
915
916         WARN_ON(!path->locks[level]);
917         WARN_ON(btrfs_header_generation(mid) != trans->transid);
918
919         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
920
921         if (level < BTRFS_MAX_LEVEL - 1)
922                 parent = path->nodes[level + 1];
923         pslot = path->slots[level + 1];
924
925         /*
926          * deal with the case where there is only one pointer in the root
927          * by promoting the node below to a root
928          */
929         if (!parent) {
930                 struct extent_buffer *child;
931
932                 if (btrfs_header_nritems(mid) != 1)
933                         return 0;
934
935                 /* promote the child to a root */
936                 child = read_node_slot(root, mid, 0);
937                 BUG_ON(!child);
938                 btrfs_tree_lock(child);
939                 btrfs_set_lock_blocking(child);
940                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
941                 BUG_ON(ret);
942
943                 spin_lock(&root->node_lock);
944                 root->node = child;
945                 spin_unlock(&root->node_lock);
946
947                 ret = btrfs_update_extent_ref(trans, root, child->start,
948                                               mid->start, child->start,
949                                               root->root_key.objectid,
950                                               trans->transid, level - 1);
951                 BUG_ON(ret);
952
953                 add_root_to_dirty_list(root);
954                 btrfs_tree_unlock(child);
955
956                 path->locks[level] = 0;
957                 path->nodes[level] = NULL;
958                 clean_tree_block(trans, root, mid);
959                 btrfs_tree_unlock(mid);
960                 /* once for the path */
961                 free_extent_buffer(mid);
962                 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
963                                         mid->start, root->root_key.objectid,
964                                         btrfs_header_generation(mid),
965                                         level, 1);
966                 /* once for the root ptr */
967                 free_extent_buffer(mid);
968                 return ret;
969         }
970         if (btrfs_header_nritems(mid) >
971             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972                 return 0;
973
974         if (btrfs_header_nritems(mid) < 2)
975                 err_on_enospc = 1;
976
977         left = read_node_slot(root, parent, pslot - 1);
978         if (left) {
979                 btrfs_tree_lock(left);
980                 btrfs_set_lock_blocking(left);
981                 wret = btrfs_cow_block(trans, root, left,
982                                        parent, pslot - 1, &left, 0);
983                 if (wret) {
984                         ret = wret;
985                         goto enospc;
986                 }
987         }
988         right = read_node_slot(root, parent, pslot + 1);
989         if (right) {
990                 btrfs_tree_lock(right);
991                 btrfs_set_lock_blocking(right);
992                 wret = btrfs_cow_block(trans, root, right,
993                                        parent, pslot + 1, &right, 0);
994                 if (wret) {
995                         ret = wret;
996                         goto enospc;
997                 }
998         }
999
1000         /* first, try to make some room in the middle buffer */
1001         if (left) {
1002                 orig_slot += btrfs_header_nritems(left);
1003                 wret = push_node_left(trans, root, left, mid, 1);
1004                 if (wret < 0)
1005                         ret = wret;
1006                 if (btrfs_header_nritems(mid) < 2)
1007                         err_on_enospc = 1;
1008         }
1009
1010         /*
1011          * then try to empty the right most buffer into the middle
1012          */
1013         if (right) {
1014                 wret = push_node_left(trans, root, mid, right, 1);
1015                 if (wret < 0 && wret != -ENOSPC)
1016                         ret = wret;
1017                 if (btrfs_header_nritems(right) == 0) {
1018                         u64 bytenr = right->start;
1019                         u64 generation = btrfs_header_generation(parent);
1020                         u32 blocksize = right->len;
1021
1022                         clean_tree_block(trans, root, right);
1023                         btrfs_tree_unlock(right);
1024                         free_extent_buffer(right);
1025                         right = NULL;
1026                         wret = del_ptr(trans, root, path, level + 1, pslot +
1027                                        1);
1028                         if (wret)
1029                                 ret = wret;
1030                         wret = btrfs_free_extent(trans, root, bytenr,
1031                                                  blocksize, parent->start,
1032                                                  btrfs_header_owner(parent),
1033                                                  generation, level, 1);
1034                         if (wret)
1035                                 ret = wret;
1036                 } else {
1037                         struct btrfs_disk_key right_key;
1038                         btrfs_node_key(right, &right_key, 0);
1039                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1040                         btrfs_mark_buffer_dirty(parent);
1041                 }
1042         }
1043         if (btrfs_header_nritems(mid) == 1) {
1044                 /*
1045                  * we're not allowed to leave a node with one item in the
1046                  * tree during a delete.  A deletion from lower in the tree
1047                  * could try to delete the only pointer in this node.
1048                  * So, pull some keys from the left.
1049                  * There has to be a left pointer at this point because
1050                  * otherwise we would have pulled some pointers from the
1051                  * right
1052                  */
1053                 BUG_ON(!left);
1054                 wret = balance_node_right(trans, root, mid, left);
1055                 if (wret < 0) {
1056                         ret = wret;
1057                         goto enospc;
1058                 }
1059                 if (wret == 1) {
1060                         wret = push_node_left(trans, root, left, mid, 1);
1061                         if (wret < 0)
1062                                 ret = wret;
1063                 }
1064                 BUG_ON(wret == 1);
1065         }
1066         if (btrfs_header_nritems(mid) == 0) {
1067                 /* we've managed to empty the middle node, drop it */
1068                 u64 root_gen = btrfs_header_generation(parent);
1069                 u64 bytenr = mid->start;
1070                 u32 blocksize = mid->len;
1071
1072                 clean_tree_block(trans, root, mid);
1073                 btrfs_tree_unlock(mid);
1074                 free_extent_buffer(mid);
1075                 mid = NULL;
1076                 wret = del_ptr(trans, root, path, level + 1, pslot);
1077                 if (wret)
1078                         ret = wret;
1079                 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1080                                          parent->start,
1081                                          btrfs_header_owner(parent),
1082                                          root_gen, level, 1);
1083                 if (wret)
1084                         ret = wret;
1085         } else {
1086                 /* update the parent key to reflect our changes */
1087                 struct btrfs_disk_key mid_key;
1088                 btrfs_node_key(mid, &mid_key, 0);
1089                 btrfs_set_node_key(parent, &mid_key, pslot);
1090                 btrfs_mark_buffer_dirty(parent);
1091         }
1092
1093         /* update the path */
1094         if (left) {
1095                 if (btrfs_header_nritems(left) > orig_slot) {
1096                         extent_buffer_get(left);
1097                         /* left was locked after cow */
1098                         path->nodes[level] = left;
1099                         path->slots[level + 1] -= 1;
1100                         path->slots[level] = orig_slot;
1101                         if (mid) {
1102                                 btrfs_tree_unlock(mid);
1103                                 free_extent_buffer(mid);
1104                         }
1105                 } else {
1106                         orig_slot -= btrfs_header_nritems(left);
1107                         path->slots[level] = orig_slot;
1108                 }
1109         }
1110         /* double check we haven't messed things up */
1111         check_block(root, path, level);
1112         if (orig_ptr !=
1113             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1114                 BUG();
1115 enospc:
1116         if (right) {
1117                 btrfs_tree_unlock(right);
1118                 free_extent_buffer(right);
1119         }
1120         if (left) {
1121                 if (path->nodes[level] != left)
1122                         btrfs_tree_unlock(left);
1123                 free_extent_buffer(left);
1124         }
1125         return ret;
1126 }
1127
1128 /* Node balancing for insertion.  Here we only split or push nodes around
1129  * when they are completely full.  This is also done top down, so we
1130  * have to be pessimistic.
1131  */
1132 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1133                                           struct btrfs_root *root,
1134                                           struct btrfs_path *path, int level)
1135 {
1136         struct extent_buffer *right = NULL;
1137         struct extent_buffer *mid;
1138         struct extent_buffer *left = NULL;
1139         struct extent_buffer *parent = NULL;
1140         int ret = 0;
1141         int wret;
1142         int pslot;
1143         int orig_slot = path->slots[level];
1144         u64 orig_ptr;
1145
1146         if (level == 0)
1147                 return 1;
1148
1149         mid = path->nodes[level];
1150         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1152
1153         if (level < BTRFS_MAX_LEVEL - 1)
1154                 parent = path->nodes[level + 1];
1155         pslot = path->slots[level + 1];
1156
1157         if (!parent)
1158                 return 1;
1159
1160         left = read_node_slot(root, parent, pslot - 1);
1161
1162         /* first, try to make some room in the middle buffer */
1163         if (left) {
1164                 u32 left_nr;
1165
1166                 btrfs_tree_lock(left);
1167                 btrfs_set_lock_blocking(left);
1168
1169                 left_nr = btrfs_header_nritems(left);
1170                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1171                         wret = 1;
1172                 } else {
1173                         ret = btrfs_cow_block(trans, root, left, parent,
1174                                               pslot - 1, &left, 0);
1175                         if (ret)
1176                                 wret = 1;
1177                         else {
1178                                 wret = push_node_left(trans, root,
1179                                                       left, mid, 0);
1180                         }
1181                 }
1182                 if (wret < 0)
1183                         ret = wret;
1184                 if (wret == 0) {
1185                         struct btrfs_disk_key disk_key;
1186                         orig_slot += left_nr;
1187                         btrfs_node_key(mid, &disk_key, 0);
1188                         btrfs_set_node_key(parent, &disk_key, pslot);
1189                         btrfs_mark_buffer_dirty(parent);
1190                         if (btrfs_header_nritems(left) > orig_slot) {
1191                                 path->nodes[level] = left;
1192                                 path->slots[level + 1] -= 1;
1193                                 path->slots[level] = orig_slot;
1194                                 btrfs_tree_unlock(mid);
1195                                 free_extent_buffer(mid);
1196                         } else {
1197                                 orig_slot -=
1198                                         btrfs_header_nritems(left);
1199                                 path->slots[level] = orig_slot;
1200                                 btrfs_tree_unlock(left);
1201                                 free_extent_buffer(left);
1202                         }
1203                         return 0;
1204                 }
1205                 btrfs_tree_unlock(left);
1206                 free_extent_buffer(left);
1207         }
1208         right = read_node_slot(root, parent, pslot + 1);
1209
1210         /*
1211          * then try to empty the right most buffer into the middle
1212          */
1213         if (right) {
1214                 u32 right_nr;
1215
1216                 btrfs_tree_lock(right);
1217                 btrfs_set_lock_blocking(right);
1218
1219                 right_nr = btrfs_header_nritems(right);
1220                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1221                         wret = 1;
1222                 } else {
1223                         ret = btrfs_cow_block(trans, root, right,
1224                                               parent, pslot + 1,
1225                                               &right, 0);
1226                         if (ret)
1227                                 wret = 1;
1228                         else {
1229                                 wret = balance_node_right(trans, root,
1230                                                           right, mid);
1231                         }
1232                 }
1233                 if (wret < 0)
1234                         ret = wret;
1235                 if (wret == 0) {
1236                         struct btrfs_disk_key disk_key;
1237
1238                         btrfs_node_key(right, &disk_key, 0);
1239                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1240                         btrfs_mark_buffer_dirty(parent);
1241
1242                         if (btrfs_header_nritems(mid) <= orig_slot) {
1243                                 path->nodes[level] = right;
1244                                 path->slots[level + 1] += 1;
1245                                 path->slots[level] = orig_slot -
1246                                         btrfs_header_nritems(mid);
1247                                 btrfs_tree_unlock(mid);
1248                                 free_extent_buffer(mid);
1249                         } else {
1250                                 btrfs_tree_unlock(right);
1251                                 free_extent_buffer(right);
1252                         }
1253                         return 0;
1254                 }
1255                 btrfs_tree_unlock(right);
1256                 free_extent_buffer(right);
1257         }
1258         return 1;
1259 }
1260
1261 /*
1262  * readahead one full node of leaves, finding things that are close
1263  * to the block in 'slot', and triggering ra on them.
1264  */
1265 static noinline void reada_for_search(struct btrfs_root *root,
1266                                       struct btrfs_path *path,
1267                                       int level, int slot, u64 objectid)
1268 {
1269         struct extent_buffer *node;
1270         struct btrfs_disk_key disk_key;
1271         u32 nritems;
1272         u64 search;
1273         u64 target;
1274         u64 nread = 0;
1275         int direction = path->reada;
1276         struct extent_buffer *eb;
1277         u32 nr;
1278         u32 blocksize;
1279         u32 nscan = 0;
1280
1281         if (level != 1)
1282                 return;
1283
1284         if (!path->nodes[level])
1285                 return;
1286
1287         node = path->nodes[level];
1288
1289         search = btrfs_node_blockptr(node, slot);
1290         blocksize = btrfs_level_size(root, level - 1);
1291         eb = btrfs_find_tree_block(root, search, blocksize);
1292         if (eb) {
1293                 free_extent_buffer(eb);
1294                 return;
1295         }
1296
1297         target = search;
1298
1299         nritems = btrfs_header_nritems(node);
1300         nr = slot;
1301         while (1) {
1302                 if (direction < 0) {
1303                         if (nr == 0)
1304                                 break;
1305                         nr--;
1306                 } else if (direction > 0) {
1307                         nr++;
1308                         if (nr >= nritems)
1309                                 break;
1310                 }
1311                 if (path->reada < 0 && objectid) {
1312                         btrfs_node_key(node, &disk_key, nr);
1313                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1314                                 break;
1315                 }
1316                 search = btrfs_node_blockptr(node, nr);
1317                 if ((search <= target && target - search <= 65536) ||
1318                     (search > target && search - target <= 65536)) {
1319                         readahead_tree_block(root, search, blocksize,
1320                                      btrfs_node_ptr_generation(node, nr));
1321                         nread += blocksize;
1322                 }
1323                 nscan++;
1324                 if ((nread > 65536 || nscan > 32))
1325                         break;
1326         }
1327 }
1328
1329 /*
1330  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1331  * cache
1332  */
1333 static noinline int reada_for_balance(struct btrfs_root *root,
1334                                       struct btrfs_path *path, int level)
1335 {
1336         int slot;
1337         int nritems;
1338         struct extent_buffer *parent;
1339         struct extent_buffer *eb;
1340         u64 gen;
1341         u64 block1 = 0;
1342         u64 block2 = 0;
1343         int ret = 0;
1344         int blocksize;
1345
1346         parent = path->nodes[level - 1];
1347         if (!parent)
1348                 return 0;
1349
1350         nritems = btrfs_header_nritems(parent);
1351         slot = path->slots[level];
1352         blocksize = btrfs_level_size(root, level);
1353
1354         if (slot > 0) {
1355                 block1 = btrfs_node_blockptr(parent, slot - 1);
1356                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1357                 eb = btrfs_find_tree_block(root, block1, blocksize);
1358                 if (eb && btrfs_buffer_uptodate(eb, gen))
1359                         block1 = 0;
1360                 free_extent_buffer(eb);
1361         }
1362         if (slot < nritems) {
1363                 block2 = btrfs_node_blockptr(parent, slot + 1);
1364                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365                 eb = btrfs_find_tree_block(root, block2, blocksize);
1366                 if (eb && btrfs_buffer_uptodate(eb, gen))
1367                         block2 = 0;
1368                 free_extent_buffer(eb);
1369         }
1370         if (block1 || block2) {
1371                 ret = -EAGAIN;
1372                 btrfs_release_path(root, path);
1373                 if (block1)
1374                         readahead_tree_block(root, block1, blocksize, 0);
1375                 if (block2)
1376                         readahead_tree_block(root, block2, blocksize, 0);
1377
1378                 if (block1) {
1379                         eb = read_tree_block(root, block1, blocksize, 0);
1380                         free_extent_buffer(eb);
1381                 }
1382                 if (block1) {
1383                         eb = read_tree_block(root, block2, blocksize, 0);
1384                         free_extent_buffer(eb);
1385                 }
1386         }
1387         return ret;
1388 }
1389
1390
1391 /*
1392  * when we walk down the tree, it is usually safe to unlock the higher layers
1393  * in the tree.  The exceptions are when our path goes through slot 0, because
1394  * operations on the tree might require changing key pointers higher up in the
1395  * tree.
1396  *
1397  * callers might also have set path->keep_locks, which tells this code to keep
1398  * the lock if the path points to the last slot in the block.  This is part of
1399  * walking through the tree, and selecting the next slot in the higher block.
1400  *
1401  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1402  * if lowest_unlock is 1, level 0 won't be unlocked
1403  */
1404 static noinline void unlock_up(struct btrfs_path *path, int level,
1405                                int lowest_unlock)
1406 {
1407         int i;
1408         int skip_level = level;
1409         int no_skips = 0;
1410         struct extent_buffer *t;
1411
1412         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1413                 if (!path->nodes[i])
1414                         break;
1415                 if (!path->locks[i])
1416                         break;
1417                 if (!no_skips && path->slots[i] == 0) {
1418                         skip_level = i + 1;
1419                         continue;
1420                 }
1421                 if (!no_skips && path->keep_locks) {
1422                         u32 nritems;
1423                         t = path->nodes[i];
1424                         nritems = btrfs_header_nritems(t);
1425                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
1426                                 skip_level = i + 1;
1427                                 continue;
1428                         }
1429                 }
1430                 if (skip_level < i && i >= lowest_unlock)
1431                         no_skips = 1;
1432
1433                 t = path->nodes[i];
1434                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1435                         btrfs_tree_unlock(t);
1436                         path->locks[i] = 0;
1437                 }
1438         }
1439 }
1440
1441 /*
1442  * This releases any locks held in the path starting at level and
1443  * going all the way up to the root.
1444  *
1445  * btrfs_search_slot will keep the lock held on higher nodes in a few
1446  * corner cases, such as COW of the block at slot zero in the node.  This
1447  * ignores those rules, and it should only be called when there are no
1448  * more updates to be done higher up in the tree.
1449  */
1450 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1451 {
1452         int i;
1453
1454         if (path->keep_locks || path->lowest_level)
1455                 return;
1456
1457         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1458                 if (!path->nodes[i])
1459                         continue;
1460                 if (!path->locks[i])
1461                         continue;
1462                 btrfs_tree_unlock(path->nodes[i]);
1463                 path->locks[i] = 0;
1464         }
1465 }
1466
1467 /*
1468  * look for key in the tree.  path is filled in with nodes along the way
1469  * if key is found, we return zero and you can find the item in the leaf
1470  * level of the path (level 0)
1471  *
1472  * If the key isn't found, the path points to the slot where it should
1473  * be inserted, and 1 is returned.  If there are other errors during the
1474  * search a negative error number is returned.
1475  *
1476  * if ins_len > 0, nodes and leaves will be split as we walk down the
1477  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1478  * possible)
1479  */
1480 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1481                       *root, struct btrfs_key *key, struct btrfs_path *p, int
1482                       ins_len, int cow)
1483 {
1484         struct extent_buffer *b;
1485         struct extent_buffer *tmp;
1486         int slot;
1487         int ret;
1488         int level;
1489         int should_reada = p->reada;
1490         int lowest_unlock = 1;
1491         int blocksize;
1492         u8 lowest_level = 0;
1493         u64 blocknr;
1494         u64 gen;
1495         struct btrfs_key prealloc_block;
1496
1497         lowest_level = p->lowest_level;
1498         WARN_ON(lowest_level && ins_len > 0);
1499         WARN_ON(p->nodes[0] != NULL);
1500
1501         if (ins_len < 0)
1502                 lowest_unlock = 2;
1503
1504         prealloc_block.objectid = 0;
1505
1506 again:
1507         if (p->skip_locking)
1508                 b = btrfs_root_node(root);
1509         else
1510                 b = btrfs_lock_root_node(root);
1511
1512         while (b) {
1513                 level = btrfs_header_level(b);
1514
1515                 /*
1516                  * setup the path here so we can release it under lock
1517                  * contention with the cow code
1518                  */
1519                 p->nodes[level] = b;
1520                 if (!p->skip_locking)
1521                         p->locks[level] = 1;
1522
1523                 if (cow) {
1524                         int wret;
1525
1526                         /* is a cow on this block not required */
1527                         if (btrfs_header_generation(b) == trans->transid &&
1528                             btrfs_header_owner(b) == root->root_key.objectid &&
1529                             !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530                                 goto cow_done;
1531                         }
1532
1533                         /* ok, we have to cow, is our old prealloc the right
1534                          * size?
1535                          */
1536                         if (prealloc_block.objectid &&
1537                             prealloc_block.offset != b->len) {
1538                                 btrfs_release_path(root, p);
1539                                 btrfs_free_reserved_extent(root,
1540                                            prealloc_block.objectid,
1541                                            prealloc_block.offset);
1542                                 prealloc_block.objectid = 0;
1543                                 goto again;
1544                         }
1545
1546                         /*
1547                          * for higher level blocks, try not to allocate blocks
1548                          * with the block and the parent locks held.
1549                          */
1550                         if (level > 0 && !prealloc_block.objectid) {
1551                                 u32 size = b->len;
1552                                 u64 hint = b->start;
1553
1554                                 btrfs_release_path(root, p);
1555                                 ret = btrfs_reserve_extent(trans, root,
1556                                                            size, size, 0,
1557                                                            hint, (u64)-1,
1558                                                            &prealloc_block, 0);
1559                                 BUG_ON(ret);
1560                                 goto again;
1561                         }
1562
1563                         btrfs_set_path_blocking(p);
1564
1565                         wret = btrfs_cow_block(trans, root, b,
1566                                                p->nodes[level + 1],
1567                                                p->slots[level + 1],
1568                                                &b, prealloc_block.objectid);
1569                         prealloc_block.objectid = 0;
1570                         if (wret) {
1571                                 free_extent_buffer(b);
1572                                 ret = wret;
1573                                 goto done;
1574                         }
1575                 }
1576 cow_done:
1577                 BUG_ON(!cow && ins_len);
1578                 if (level != btrfs_header_level(b))
1579                         WARN_ON(1);
1580                 level = btrfs_header_level(b);
1581
1582                 p->nodes[level] = b;
1583                 if (!p->skip_locking)
1584                         p->locks[level] = 1;
1585
1586                 btrfs_clear_path_blocking(p, NULL);
1587
1588                 /*
1589                  * we have a lock on b and as long as we aren't changing
1590                  * the tree, there is no way to for the items in b to change.
1591                  * It is safe to drop the lock on our parent before we
1592                  * go through the expensive btree search on b.
1593                  *
1594                  * If cow is true, then we might be changing slot zero,
1595                  * which may require changing the parent.  So, we can't
1596                  * drop the lock until after we know which slot we're
1597                  * operating on.
1598                  */
1599                 if (!cow)
1600                         btrfs_unlock_up_safe(p, level + 1);
1601
1602                 ret = check_block(root, p, level);
1603                 if (ret) {
1604                         ret = -1;
1605                         goto done;
1606                 }
1607
1608                 ret = bin_search(b, key, level, &slot);
1609
1610                 if (level != 0) {
1611                         if (ret && slot > 0)
1612                                 slot -= 1;
1613                         p->slots[level] = slot;
1614                         if ((p->search_for_split || ins_len > 0) &&
1615                             btrfs_header_nritems(b) >=
1616                             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1617                                 int sret;
1618
1619                                 sret = reada_for_balance(root, p, level);
1620                                 if (sret)
1621                                         goto again;
1622
1623                                 btrfs_set_path_blocking(p);
1624                                 sret = split_node(trans, root, p, level);
1625                                 btrfs_clear_path_blocking(p, NULL);
1626
1627                                 BUG_ON(sret > 0);
1628                                 if (sret) {
1629                                         ret = sret;
1630                                         goto done;
1631                                 }
1632                                 b = p->nodes[level];
1633                                 slot = p->slots[level];
1634                         } else if (ins_len < 0 &&
1635                                    btrfs_header_nritems(b) <
1636                                    BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637                                 int sret;
1638
1639                                 sret = reada_for_balance(root, p, level);
1640                                 if (sret)
1641                                         goto again;
1642
1643                                 btrfs_set_path_blocking(p);
1644                                 sret = balance_level(trans, root, p, level);
1645                                 btrfs_clear_path_blocking(p, NULL);
1646
1647                                 if (sret) {
1648                                         ret = sret;
1649                                         goto done;
1650                                 }
1651                                 b = p->nodes[level];
1652                                 if (!b) {
1653                                         btrfs_release_path(NULL, p);
1654                                         goto again;
1655                                 }
1656                                 slot = p->slots[level];
1657                                 BUG_ON(btrfs_header_nritems(b) == 1);
1658                         }
1659                         unlock_up(p, level, lowest_unlock);
1660
1661                         /* this is only true while dropping a snapshot */
1662                         if (level == lowest_level) {
1663                                 ret = 0;
1664                                 goto done;
1665                         }
1666
1667                         blocknr = btrfs_node_blockptr(b, slot);
1668                         gen = btrfs_node_ptr_generation(b, slot);
1669                         blocksize = btrfs_level_size(root, level - 1);
1670
1671                         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1672                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673                                 b = tmp;
1674                         } else {
1675                                 /*
1676                                  * reduce lock contention at high levels
1677                                  * of the btree by dropping locks before
1678                                  * we read.
1679                                  */
1680                                 if (level > 0) {
1681                                         btrfs_release_path(NULL, p);
1682                                         if (tmp)
1683                                                 free_extent_buffer(tmp);
1684                                         if (should_reada)
1685                                                 reada_for_search(root, p,
1686                                                                  level, slot,
1687                                                                  key->objectid);
1688
1689                                         tmp = read_tree_block(root, blocknr,
1690                                                          blocksize, gen);
1691                                         if (tmp)
1692                                                 free_extent_buffer(tmp);
1693                                         goto again;
1694                                 } else {
1695                                         btrfs_set_path_blocking(p);
1696                                         if (tmp)
1697                                                 free_extent_buffer(tmp);
1698                                         if (should_reada)
1699                                                 reada_for_search(root, p,
1700                                                                  level, slot,
1701                                                                  key->objectid);
1702                                         b = read_node_slot(root, b, slot);
1703                                 }
1704                         }
1705                         if (!p->skip_locking) {
1706                                 int lret;
1707
1708                                 btrfs_clear_path_blocking(p, NULL);
1709                                 lret = btrfs_try_spin_lock(b);
1710
1711                                 if (!lret) {
1712                                         btrfs_set_path_blocking(p);
1713                                         btrfs_tree_lock(b);
1714                                         btrfs_clear_path_blocking(p, b);
1715                                 }
1716                         }
1717                 } else {
1718                         p->slots[level] = slot;
1719                         if (ins_len > 0 &&
1720                             btrfs_leaf_free_space(root, b) < ins_len) {
1721                                 int sret;
1722
1723                                 btrfs_set_path_blocking(p);
1724                                 sret = split_leaf(trans, root, key,
1725                                                       p, ins_len, ret == 0);
1726                                 btrfs_clear_path_blocking(p, NULL);
1727
1728                                 BUG_ON(sret > 0);
1729                                 if (sret) {
1730                                         ret = sret;
1731                                         goto done;
1732                                 }
1733                         }
1734                         if (!p->search_for_split)
1735                                 unlock_up(p, level, lowest_unlock);
1736                         goto done;
1737                 }
1738         }
1739         ret = 1;
1740 done:
1741         /*
1742          * we don't really know what they plan on doing with the path
1743          * from here on, so for now just mark it as blocking
1744          */
1745         btrfs_set_path_blocking(p);
1746         if (prealloc_block.objectid) {
1747                 btrfs_free_reserved_extent(root,
1748                            prealloc_block.objectid,
1749                            prealloc_block.offset);
1750         }
1751         return ret;
1752 }
1753
1754 int btrfs_merge_path(struct btrfs_trans_handle *trans,
1755                      struct btrfs_root *root,
1756                      struct btrfs_key *node_keys,
1757                      u64 *nodes, int lowest_level)
1758 {
1759         struct extent_buffer *eb;
1760         struct extent_buffer *parent;
1761         struct btrfs_key key;
1762         u64 bytenr;
1763         u64 generation;
1764         u32 blocksize;
1765         int level;
1766         int slot;
1767         int key_match;
1768         int ret;
1769
1770         eb = btrfs_lock_root_node(root);
1771         ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1772         BUG_ON(ret);
1773
1774         btrfs_set_lock_blocking(eb);
1775
1776         parent = eb;
1777         while (1) {
1778                 level = btrfs_header_level(parent);
1779                 if (level == 0 || level <= lowest_level)
1780                         break;
1781
1782                 ret = bin_search(parent, &node_keys[lowest_level], level,
1783                                  &slot);
1784                 if (ret && slot > 0)
1785                         slot--;
1786
1787                 bytenr = btrfs_node_blockptr(parent, slot);
1788                 if (nodes[level - 1] == bytenr)
1789                         break;
1790
1791                 blocksize = btrfs_level_size(root, level - 1);
1792                 generation = btrfs_node_ptr_generation(parent, slot);
1793                 btrfs_node_key_to_cpu(eb, &key, slot);
1794                 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1795
1796                 if (generation == trans->transid) {
1797                         eb = read_tree_block(root, bytenr, blocksize,
1798                                              generation);
1799                         btrfs_tree_lock(eb);
1800                         btrfs_set_lock_blocking(eb);
1801                 }
1802
1803                 /*
1804                  * if node keys match and node pointer hasn't been modified
1805                  * in the running transaction, we can merge the path. for
1806                  * blocks owened by reloc trees, the node pointer check is
1807                  * skipped, this is because these blocks are fully controlled
1808                  * by the space balance code, no one else can modify them.
1809                  */
1810                 if (!nodes[level - 1] || !key_match ||
1811                     (generation == trans->transid &&
1812                      btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1813                         if (level == 1 || level == lowest_level + 1) {
1814                                 if (generation == trans->transid) {
1815                                         btrfs_tree_unlock(eb);
1816                                         free_extent_buffer(eb);
1817                                 }
1818                                 break;
1819                         }
1820
1821                         if (generation != trans->transid) {
1822                                 eb = read_tree_block(root, bytenr, blocksize,
1823                                                 generation);
1824                                 btrfs_tree_lock(eb);
1825                                 btrfs_set_lock_blocking(eb);
1826                         }
1827
1828                         ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829                                               &eb, 0);
1830                         BUG_ON(ret);
1831
1832                         if (root->root_key.objectid ==
1833                             BTRFS_TREE_RELOC_OBJECTID) {
1834                                 if (!nodes[level - 1]) {
1835                                         nodes[level - 1] = eb->start;
1836                                         memcpy(&node_keys[level - 1], &key,
1837                                                sizeof(node_keys[0]));
1838                                 } else {
1839                                         WARN_ON(1);
1840                                 }
1841                         }
1842
1843                         btrfs_tree_unlock(parent);
1844                         free_extent_buffer(parent);
1845                         parent = eb;
1846                         continue;
1847                 }
1848
1849                 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1850                 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1851                 btrfs_mark_buffer_dirty(parent);
1852
1853                 ret = btrfs_inc_extent_ref(trans, root,
1854                                         nodes[level - 1],
1855                                         blocksize, parent->start,
1856                                         btrfs_header_owner(parent),
1857                                         btrfs_header_generation(parent),
1858                                         level - 1);
1859                 BUG_ON(ret);
1860
1861                 /*
1862                  * If the block was created in the running transaction,
1863                  * it's possible this is the last reference to it, so we
1864                  * should drop the subtree.
1865                  */
1866                 if (generation == trans->transid) {
1867                         ret = btrfs_drop_subtree(trans, root, eb, parent);
1868                         BUG_ON(ret);
1869                         btrfs_tree_unlock(eb);
1870                         free_extent_buffer(eb);
1871                 } else {
1872                         ret = btrfs_free_extent(trans, root, bytenr,
1873                                         blocksize, parent->start,
1874                                         btrfs_header_owner(parent),
1875                                         btrfs_header_generation(parent),
1876                                         level - 1, 1);
1877                         BUG_ON(ret);
1878                 }
1879                 break;
1880         }
1881         btrfs_tree_unlock(parent);
1882         free_extent_buffer(parent);
1883         return 0;
1884 }
1885
1886 /*
1887  * adjust the pointers going up the tree, starting at level
1888  * making sure the right key of each node is points to 'key'.
1889  * This is used after shifting pointers to the left, so it stops
1890  * fixing up pointers when a given leaf/node is not in slot 0 of the
1891  * higher levels
1892  *
1893  * If this fails to write a tree block, it returns -1, but continues
1894  * fixing up the blocks in ram so the tree is consistent.
1895  */
1896 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1897                           struct btrfs_root *root, struct btrfs_path *path,
1898                           struct btrfs_disk_key *key, int level)
1899 {
1900         int i;
1901         int ret = 0;
1902         struct extent_buffer *t;
1903
1904         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1905                 int tslot = path->slots[i];
1906                 if (!path->nodes[i])
1907                         break;
1908                 t = path->nodes[i];
1909                 btrfs_set_node_key(t, key, tslot);
1910                 btrfs_mark_buffer_dirty(path->nodes[i]);
1911                 if (tslot != 0)
1912                         break;
1913         }
1914         return ret;
1915 }
1916
1917 /*
1918  * update item key.
1919  *
1920  * This function isn't completely safe. It's the caller's responsibility
1921  * that the new key won't break the order
1922  */
1923 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1924                             struct btrfs_root *root, struct btrfs_path *path,
1925                             struct btrfs_key *new_key)
1926 {
1927         struct btrfs_disk_key disk_key;
1928         struct extent_buffer *eb;
1929         int slot;
1930
1931         eb = path->nodes[0];
1932         slot = path->slots[0];
1933         if (slot > 0) {
1934                 btrfs_item_key(eb, &disk_key, slot - 1);
1935                 if (comp_keys(&disk_key, new_key) >= 0)
1936                         return -1;
1937         }
1938         if (slot < btrfs_header_nritems(eb) - 1) {
1939                 btrfs_item_key(eb, &disk_key, slot + 1);
1940                 if (comp_keys(&disk_key, new_key) <= 0)
1941                         return -1;
1942         }
1943
1944         btrfs_cpu_key_to_disk(&disk_key, new_key);
1945         btrfs_set_item_key(eb, &disk_key, slot);
1946         btrfs_mark_buffer_dirty(eb);
1947         if (slot == 0)
1948                 fixup_low_keys(trans, root, path, &disk_key, 1);
1949         return 0;
1950 }
1951
1952 /*
1953  * try to push data from one node into the next node left in the
1954  * tree.
1955  *
1956  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1957  * error, and > 0 if there was no room in the left hand block.
1958  */
1959 static int push_node_left(struct btrfs_trans_handle *trans,
1960                           struct btrfs_root *root, struct extent_buffer *dst,
1961                           struct extent_buffer *src, int empty)
1962 {
1963         int push_items = 0;
1964         int src_nritems;
1965         int dst_nritems;
1966         int ret = 0;
1967
1968         src_nritems = btrfs_header_nritems(src);
1969         dst_nritems = btrfs_header_nritems(dst);
1970         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1971         WARN_ON(btrfs_header_generation(src) != trans->transid);
1972         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1973
1974         if (!empty && src_nritems <= 8)
1975                 return 1;
1976
1977         if (push_items <= 0)
1978                 return 1;
1979
1980         if (empty) {
1981                 push_items = min(src_nritems, push_items);
1982                 if (push_items < src_nritems) {
1983                         /* leave at least 8 pointers in the node if
1984                          * we aren't going to empty it
1985                          */
1986                         if (src_nritems - push_items < 8) {
1987                                 if (push_items <= 8)
1988                                         return 1;
1989                                 push_items -= 8;
1990                         }
1991                 }
1992         } else
1993                 push_items = min(src_nritems - 8, push_items);
1994
1995         copy_extent_buffer(dst, src,
1996                            btrfs_node_key_ptr_offset(dst_nritems),
1997                            btrfs_node_key_ptr_offset(0),
1998                            push_items * sizeof(struct btrfs_key_ptr));
1999
2000         if (push_items < src_nritems) {
2001                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2002                                       btrfs_node_key_ptr_offset(push_items),
2003                                       (src_nritems - push_items) *
2004                                       sizeof(struct btrfs_key_ptr));
2005         }
2006         btrfs_set_header_nritems(src, src_nritems - push_items);
2007         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2008         btrfs_mark_buffer_dirty(src);
2009         btrfs_mark_buffer_dirty(dst);
2010
2011         ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
2012         BUG_ON(ret);
2013
2014         return ret;
2015 }
2016
2017 /*
2018  * try to push data from one node into the next node right in the
2019  * tree.
2020  *
2021  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2022  * error, and > 0 if there was no room in the right hand block.
2023  *
2024  * this will  only push up to 1/2 the contents of the left node over
2025  */
2026 static int balance_node_right(struct btrfs_trans_handle *trans,
2027                               struct btrfs_root *root,
2028                               struct extent_buffer *dst,
2029                               struct extent_buffer *src)
2030 {
2031         int push_items = 0;
2032         int max_push;
2033         int src_nritems;
2034         int dst_nritems;
2035         int ret = 0;
2036
2037         WARN_ON(btrfs_header_generation(src) != trans->transid);
2038         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2039
2040         src_nritems = btrfs_header_nritems(src);
2041         dst_nritems = btrfs_header_nritems(dst);
2042         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2043         if (push_items <= 0)
2044                 return 1;
2045
2046         if (src_nritems < 4)
2047                 return 1;
2048
2049         max_push = src_nritems / 2 + 1;
2050         /* don't try to empty the node */
2051         if (max_push >= src_nritems)
2052                 return 1;
2053
2054         if (max_push < push_items)
2055                 push_items = max_push;
2056
2057         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2058                                       btrfs_node_key_ptr_offset(0),
2059                                       (dst_nritems) *
2060                                       sizeof(struct btrfs_key_ptr));
2061
2062         copy_extent_buffer(dst, src,
2063                            btrfs_node_key_ptr_offset(0),
2064                            btrfs_node_key_ptr_offset(src_nritems - push_items),
2065                            push_items * sizeof(struct btrfs_key_ptr));
2066
2067         btrfs_set_header_nritems(src, src_nritems - push_items);
2068         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2069
2070         btrfs_mark_buffer_dirty(src);
2071         btrfs_mark_buffer_dirty(dst);
2072
2073         ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2074         BUG_ON(ret);
2075
2076         return ret;
2077 }
2078
2079 /*
2080  * helper function to insert a new root level in the tree.
2081  * A new node is allocated, and a single item is inserted to
2082  * point to the existing root
2083  *
2084  * returns zero on success or < 0 on failure.
2085  */
2086 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2087                            struct btrfs_root *root,
2088                            struct btrfs_path *path, int level)
2089 {
2090         u64 lower_gen;
2091         struct extent_buffer *lower;
2092         struct extent_buffer *c;
2093         struct extent_buffer *old;
2094         struct btrfs_disk_key lower_key;
2095         int ret;
2096
2097         BUG_ON(path->nodes[level]);
2098         BUG_ON(path->nodes[level-1] != root->node);
2099
2100         lower = path->nodes[level-1];
2101         if (level == 1)
2102                 btrfs_item_key(lower, &lower_key, 0);
2103         else
2104                 btrfs_node_key(lower, &lower_key, 0);
2105
2106         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2107                                    root->root_key.objectid, trans->transid,
2108                                    level, root->node->start, 0);
2109         if (IS_ERR(c))
2110                 return PTR_ERR(c);
2111
2112         memset_extent_buffer(c, 0, 0, root->nodesize);
2113         btrfs_set_header_nritems(c, 1);
2114         btrfs_set_header_level(c, level);
2115         btrfs_set_header_bytenr(c, c->start);
2116         btrfs_set_header_generation(c, trans->transid);
2117         btrfs_set_header_owner(c, root->root_key.objectid);
2118
2119         write_extent_buffer(c, root->fs_info->fsid,
2120                             (unsigned long)btrfs_header_fsid(c),
2121                             BTRFS_FSID_SIZE);
2122
2123         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2124                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
2125                             BTRFS_UUID_SIZE);
2126
2127         btrfs_set_node_key(c, &lower_key, 0);
2128         btrfs_set_node_blockptr(c, 0, lower->start);
2129         lower_gen = btrfs_header_generation(lower);
2130         WARN_ON(lower_gen != trans->transid);
2131
2132         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2133
2134         btrfs_mark_buffer_dirty(c);
2135
2136         spin_lock(&root->node_lock);
2137         old = root->node;
2138         root->node = c;
2139         spin_unlock(&root->node_lock);
2140
2141         ret = btrfs_update_extent_ref(trans, root, lower->start,
2142                                       lower->start, c->start,
2143                                       root->root_key.objectid,
2144                                       trans->transid, level - 1);
2145         BUG_ON(ret);
2146
2147         /* the super has an extra ref to root->node */
2148         free_extent_buffer(old);
2149
2150         add_root_to_dirty_list(root);
2151         extent_buffer_get(c);
2152         path->nodes[level] = c;
2153         path->locks[level] = 1;
2154         path->slots[level] = 0;
2155         return 0;
2156 }
2157
2158 /*
2159  * worker function to insert a single pointer in a node.
2160  * the node should have enough room for the pointer already
2161  *
2162  * slot and level indicate where you want the key to go, and
2163  * blocknr is the block the key points to.
2164  *
2165  * returns zero on success and < 0 on any error
2166  */
2167 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2168                       *root, struct btrfs_path *path, struct btrfs_disk_key
2169                       *key, u64 bytenr, int slot, int level)
2170 {
2171         struct extent_buffer *lower;
2172         int nritems;
2173
2174         BUG_ON(!path->nodes[level]);
2175         lower = path->nodes[level];
2176         nritems = btrfs_header_nritems(lower);
2177         if (slot > nritems)
2178                 BUG();
2179         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2180                 BUG();
2181         if (slot != nritems) {
2182                 memmove_extent_buffer(lower,
2183                               btrfs_node_key_ptr_offset(slot + 1),
2184                               btrfs_node_key_ptr_offset(slot),
2185                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2186         }
2187         btrfs_set_node_key(lower, key, slot);
2188         btrfs_set_node_blockptr(lower, slot, bytenr);
2189         WARN_ON(trans->transid == 0);
2190         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2191         btrfs_set_header_nritems(lower, nritems + 1);
2192         btrfs_mark_buffer_dirty(lower);
2193         return 0;
2194 }
2195
2196 /*
2197  * split the node at the specified level in path in two.
2198  * The path is corrected to point to the appropriate node after the split
2199  *
2200  * Before splitting this tries to make some room in the node by pushing
2201  * left and right, if either one works, it returns right away.
2202  *
2203  * returns 0 on success and < 0 on failure
2204  */
2205 static noinline int split_node(struct btrfs_trans_handle *trans,
2206                                struct btrfs_root *root,
2207                                struct btrfs_path *path, int level)
2208 {
2209         struct extent_buffer *c;
2210         struct extent_buffer *split;
2211         struct btrfs_disk_key disk_key;
2212         int mid;
2213         int ret;
2214         int wret;
2215         u32 c_nritems;
2216
2217         c = path->nodes[level];
2218         WARN_ON(btrfs_header_generation(c) != trans->transid);
2219         if (c == root->node) {
2220                 /* trying to split the root, lets make a new one */
2221                 ret = insert_new_root(trans, root, path, level + 1);
2222                 if (ret)
2223                         return ret;
2224         } else {
2225                 ret = push_nodes_for_insert(trans, root, path, level);
2226                 c = path->nodes[level];
2227                 if (!ret && btrfs_header_nritems(c) <
2228                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2229                         return 0;
2230                 if (ret < 0)
2231                         return ret;
2232         }
2233
2234         c_nritems = btrfs_header_nritems(c);
2235
2236         split = btrfs_alloc_free_block(trans, root, root->nodesize,
2237                                         path->nodes[level + 1]->start,
2238                                         root->root_key.objectid,
2239                                         trans->transid, level, c->start, 0);
2240         if (IS_ERR(split))
2241                 return PTR_ERR(split);
2242
2243         btrfs_set_header_flags(split, btrfs_header_flags(c));
2244         btrfs_set_header_level(split, btrfs_header_level(c));
2245         btrfs_set_header_bytenr(split, split->start);
2246         btrfs_set_header_generation(split, trans->transid);
2247         btrfs_set_header_owner(split, root->root_key.objectid);
2248         btrfs_set_header_flags(split, 0);
2249         write_extent_buffer(split, root->fs_info->fsid,
2250                             (unsigned long)btrfs_header_fsid(split),
2251                             BTRFS_FSID_SIZE);
2252         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2253                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
2254                             BTRFS_UUID_SIZE);
2255
2256         mid = (c_nritems + 1) / 2;
2257
2258         copy_extent_buffer(split, c,
2259                            btrfs_node_key_ptr_offset(0),
2260                            btrfs_node_key_ptr_offset(mid),
2261                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2262         btrfs_set_header_nritems(split, c_nritems - mid);
2263         btrfs_set_header_nritems(c, mid);
2264         ret = 0;
2265
2266         btrfs_mark_buffer_dirty(c);
2267         btrfs_mark_buffer_dirty(split);
2268
2269         btrfs_node_key(split, &disk_key, 0);
2270         wret = insert_ptr(trans, root, path, &disk_key, split->start,
2271                           path->slots[level + 1] + 1,
2272                           level + 1);
2273         if (wret)
2274                 ret = wret;
2275
2276         ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2277         BUG_ON(ret);
2278
2279         if (path->slots[level] >= mid) {
2280                 path->slots[level] -= mid;
2281                 btrfs_tree_unlock(c);
2282                 free_extent_buffer(c);
2283                 path->nodes[level] = split;
2284                 path->slots[level + 1] += 1;
2285         } else {
2286                 btrfs_tree_unlock(split);
2287                 free_extent_buffer(split);
2288         }
2289         return ret;
2290 }
2291
2292 /*
2293  * how many bytes are required to store the items in a leaf.  start
2294  * and nr indicate which items in the leaf to check.  This totals up the
2295  * space used both by the item structs and the item data
2296  */
2297 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2298 {
2299         int data_len;
2300         int nritems = btrfs_header_nritems(l);
2301         int end = min(nritems, start + nr) - 1;
2302
2303         if (!nr)
2304                 return 0;
2305         data_len = btrfs_item_end_nr(l, start);
2306         data_len = data_len - btrfs_item_offset_nr(l, end);
2307         data_len += sizeof(struct btrfs_item) * nr;
2308         WARN_ON(data_len < 0);
2309         return data_len;
2310 }
2311
2312 /*
2313  * The space between the end of the leaf items and
2314  * the start of the leaf data.  IOW, how much room
2315  * the leaf has left for both items and data
2316  */
2317 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2318                                    struct extent_buffer *leaf)
2319 {
2320         int nritems = btrfs_header_nritems(leaf);
2321         int ret;
2322         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2323         if (ret < 0) {
2324                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2325                        "used %d nritems %d\n",
2326                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2327                        leaf_space_used(leaf, 0, nritems), nritems);
2328         }
2329         return ret;
2330 }
2331
2332 /*
2333  * push some data in the path leaf to the right, trying to free up at
2334  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2335  *
2336  * returns 1 if the push failed because the other node didn't have enough
2337  * room, 0 if everything worked out and < 0 if there were major errors.
2338  */
2339 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340                            *root, struct btrfs_path *path, int data_size,
2341                            int empty)
2342 {
2343         struct extent_buffer *left = path->nodes[0];
2344         struct extent_buffer *right;
2345         struct extent_buffer *upper;
2346         struct btrfs_disk_key disk_key;
2347         int slot;
2348         u32 i;
2349         int free_space;
2350         int push_space = 0;
2351         int push_items = 0;
2352         struct btrfs_item *item;
2353         u32 left_nritems;
2354         u32 nr;
2355         u32 right_nritems;
2356         u32 data_end;
2357         u32 this_item_size;
2358         int ret;
2359
2360         slot = path->slots[1];
2361         if (!path->nodes[1])
2362                 return 1;
2363
2364         upper = path->nodes[1];
2365         if (slot >= btrfs_header_nritems(upper) - 1)
2366                 return 1;
2367
2368         btrfs_assert_tree_locked(path->nodes[1]);
2369
2370         right = read_node_slot(root, upper, slot + 1);
2371         btrfs_tree_lock(right);
2372         btrfs_set_lock_blocking(right);
2373
2374         free_space = btrfs_leaf_free_space(root, right);
2375         if (free_space < data_size)
2376                 goto out_unlock;
2377
2378         /* cow and double check */
2379         ret = btrfs_cow_block(trans, root, right, upper,
2380                               slot + 1, &right, 0);
2381         if (ret)
2382                 goto out_unlock;
2383
2384         free_space = btrfs_leaf_free_space(root, right);
2385         if (free_space < data_size)
2386                 goto out_unlock;
2387
2388         left_nritems = btrfs_header_nritems(left);
2389         if (left_nritems == 0)
2390                 goto out_unlock;
2391
2392         if (empty)
2393                 nr = 0;
2394         else
2395                 nr = 1;
2396
2397         if (path->slots[0] >= left_nritems)
2398                 push_space += data_size;
2399
2400         i = left_nritems - 1;
2401         while (i >= nr) {
2402                 item = btrfs_item_nr(left, i);
2403
2404                 if (!empty && push_items > 0) {
2405                         if (path->slots[0] > i)
2406                                 break;
2407                         if (path->slots[0] == i) {
2408                                 int space = btrfs_leaf_free_space(root, left);
2409                                 if (space + push_space * 2 > free_space)
2410                                         break;
2411                         }
2412                 }
2413
2414                 if (path->slots[0] == i)
2415                         push_space += data_size;
2416
2417                 if (!left->map_token) {
2418                         map_extent_buffer(left, (unsigned long)item,
2419                                         sizeof(struct btrfs_item),
2420                                         &left->map_token, &left->kaddr,
2421                                         &left->map_start, &left->map_len,
2422                                         KM_USER1);
2423                 }
2424
2425                 this_item_size = btrfs_item_size(left, item);
2426                 if (this_item_size + sizeof(*item) + push_space > free_space)
2427                         break;
2428
2429                 push_items++;
2430                 push_space += this_item_size + sizeof(*item);
2431                 if (i == 0)
2432                         break;
2433                 i--;
2434         }
2435         if (left->map_token) {
2436                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2437                 left->map_token = NULL;
2438         }
2439
2440         if (push_items == 0)
2441                 goto out_unlock;
2442
2443         if (!empty && push_items == left_nritems)
2444                 WARN_ON(1);
2445
2446         /* push left to right */
2447         right_nritems = btrfs_header_nritems(right);
2448
2449         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2450         push_space -= leaf_data_end(root, left);
2451
2452         /* make room in the right data area */
2453         data_end = leaf_data_end(root, right);
2454         memmove_extent_buffer(right,
2455                               btrfs_leaf_data(right) + data_end - push_space,
2456                               btrfs_leaf_data(right) + data_end,
2457                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
2458
2459         /* copy from the left data area */
2460         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2461                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2462                      btrfs_leaf_data(left) + leaf_data_end(root, left),
2463                      push_space);
2464
2465         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2466                               btrfs_item_nr_offset(0),
2467                               right_nritems * sizeof(struct btrfs_item));
2468
2469         /* copy the items from left to right */
2470         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2471                    btrfs_item_nr_offset(left_nritems - push_items),
2472                    push_items * sizeof(struct btrfs_item));
2473
2474         /* update the item pointers */
2475         right_nritems += push_items;
2476         btrfs_set_header_nritems(right, right_nritems);
2477         push_space = BTRFS_LEAF_DATA_SIZE(root);
2478         for (i = 0; i < right_nritems; i++) {
2479                 item = btrfs_item_nr(right, i);
2480                 if (!right->map_token) {
2481                         map_extent_buffer(right, (unsigned long)item,
2482                                         sizeof(struct btrfs_item),
2483                                         &right->map_token, &right->kaddr,
2484                                         &right->map_start, &right->map_len,
2485                                         KM_USER1);
2486                 }
2487                 push_space -= btrfs_item_size(right, item);
2488                 btrfs_set_item_offset(right, item, push_space);
2489         }
2490
2491         if (right->map_token) {
2492                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2493                 right->map_token = NULL;
2494         }
2495         left_nritems -= push_items;
2496         btrfs_set_header_nritems(left, left_nritems);
2497
2498         if (left_nritems)
2499                 btrfs_mark_buffer_dirty(left);
2500         btrfs_mark_buffer_dirty(right);
2501
2502         ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2503         BUG_ON(ret);
2504
2505         btrfs_item_key(right, &disk_key, 0);
2506         btrfs_set_node_key(upper, &disk_key, slot + 1);
2507         btrfs_mark_buffer_dirty(upper);
2508
2509         /* then fixup the leaf pointer in the path */
2510         if (path->slots[0] >= left_nritems) {
2511                 path->slots[0] -= left_nritems;
2512                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2513                         clean_tree_block(trans, root, path->nodes[0]);
2514                 btrfs_tree_unlock(path->nodes[0]);
2515                 free_extent_buffer(path->nodes[0]);
2516                 path->nodes[0] = right;
2517                 path->slots[1] += 1;
2518         } else {
2519                 btrfs_tree_unlock(right);
2520                 free_extent_buffer(right);
2521         }
2522         return 0;
2523
2524 out_unlock:
2525         btrfs_tree_unlock(right);
2526         free_extent_buffer(right);
2527         return 1;
2528 }
2529
2530 /*
2531  * push some data in the path leaf to the left, trying to free up at
2532  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2533  */
2534 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2535                           *root, struct btrfs_path *path, int data_size,
2536                           int empty)
2537 {
2538         struct btrfs_disk_key disk_key;
2539         struct extent_buffer *right = path->nodes[0];
2540         struct extent_buffer *left;
2541         int slot;
2542         int i;
2543         int free_space;
2544         int push_space = 0;
2545         int push_items = 0;
2546         struct btrfs_item *item;
2547         u32 old_left_nritems;
2548         u32 right_nritems;
2549         u32 nr;
2550         int ret = 0;
2551         int wret;
2552         u32 this_item_size;
2553         u32 old_left_item_size;
2554
2555         slot = path->slots[1];
2556         if (slot == 0)
2557                 return 1;
2558         if (!path->nodes[1])
2559                 return 1;
2560
2561         right_nritems = btrfs_header_nritems(right);
2562         if (right_nritems == 0)
2563                 return 1;
2564
2565         btrfs_assert_tree_locked(path->nodes[1]);
2566
2567         left = read_node_slot(root, path->nodes[1], slot - 1);
2568         btrfs_tree_lock(left);
2569         btrfs_set_lock_blocking(left);
2570
2571         free_space = btrfs_leaf_free_space(root, left);
2572         if (free_space < data_size) {
2573                 ret = 1;
2574                 goto out;
2575         }
2576
2577         /* cow and double check */
2578         ret = btrfs_cow_block(trans, root, left,
2579                               path->nodes[1], slot - 1, &left, 0);
2580         if (ret) {
2581                 /* we hit -ENOSPC, but it isn't fatal here */
2582                 ret = 1;
2583                 goto out;
2584         }
2585
2586         free_space = btrfs_leaf_free_space(root, left);
2587         if (free_space < data_size) {
2588                 ret = 1;
2589                 goto out;
2590         }
2591
2592         if (empty)
2593                 nr = right_nritems;
2594         else
2595                 nr = right_nritems - 1;
2596
2597         for (i = 0; i < nr; i++) {
2598                 item = btrfs_item_nr(right, i);
2599                 if (!right->map_token) {
2600                         map_extent_buffer(right, (unsigned long)item,
2601                                         sizeof(struct btrfs_item),
2602                                         &right->map_token, &right->kaddr,
2603                                         &right->map_start, &right->map_len,
2604                                         KM_USER1);
2605                 }
2606
2607                 if (!empty && push_items > 0) {
2608                         if (path->slots[0] < i)
2609                                 break;
2610                         if (path->slots[0] == i) {
2611                                 int space = btrfs_leaf_free_space(root, right);
2612                                 if (space + push_space * 2 > free_space)
2613                                         break;
2614                         }
2615                 }
2616
2617                 if (path->slots[0] == i)
2618                         push_space += data_size;
2619
2620                 this_item_size = btrfs_item_size(right, item);
2621                 if (this_item_size + sizeof(*item) + push_space > free_space)
2622                         break;
2623
2624                 push_items++;
2625                 push_space += this_item_size + sizeof(*item);
2626         }
2627
2628         if (right->map_token) {
2629                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2630                 right->map_token = NULL;
2631         }
2632
2633         if (push_items == 0) {
2634                 ret = 1;
2635                 goto out;
2636         }
2637         if (!empty && push_items == btrfs_header_nritems(right))
2638                 WARN_ON(1);
2639
2640         /* push data from right to left */
2641         copy_extent_buffer(left, right,
2642                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2643                            btrfs_item_nr_offset(0),
2644                            push_items * sizeof(struct btrfs_item));
2645
2646         push_space = BTRFS_LEAF_DATA_SIZE(root) -
2647                      btrfs_item_offset_nr(right, push_items - 1);
2648
2649         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2650                      leaf_data_end(root, left) - push_space,
2651                      btrfs_leaf_data(right) +
2652                      btrfs_item_offset_nr(right, push_items - 1),
2653                      push_space);
2654         old_left_nritems = btrfs_header_nritems(left);
2655         BUG_ON(old_left_nritems <= 0);
2656
2657         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2658         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2659                 u32 ioff;
2660
2661                 item = btrfs_item_nr(left, i);
2662                 if (!left->map_token) {
2663                         map_extent_buffer(left, (unsigned long)item,
2664                                         sizeof(struct btrfs_item),
2665                                         &left->map_token, &left->kaddr,
2666                                         &left->map_start, &left->map_len,
2667                                         KM_USER1);
2668                 }
2669
2670                 ioff = btrfs_item_offset(left, item);
2671                 btrfs_set_item_offset(left, item,
2672                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2673         }
2674         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2675         if (left->map_token) {
2676                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2677                 left->map_token = NULL;
2678         }
2679
2680         /* fixup right node */
2681         if (push_items > right_nritems) {
2682                 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2683                        right_nritems);
2684                 WARN_ON(1);
2685         }
2686
2687         if (push_items < right_nritems) {
2688                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2689                                                   leaf_data_end(root, right);
2690                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2691                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
2692                                       btrfs_leaf_data(right) +
2693                                       leaf_data_end(root, right), push_space);
2694
2695                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2696                               btrfs_item_nr_offset(push_items),
2697                              (btrfs_header_nritems(right) - push_items) *
2698                              sizeof(struct btrfs_item));
2699         }
2700         right_nritems -= push_items;
2701         btrfs_set_header_nritems(right, right_nritems);
2702         push_space = BTRFS_LEAF_DATA_SIZE(root);
2703         for (i = 0; i < right_nritems; i++) {
2704                 item = btrfs_item_nr(right, i);
2705
2706                 if (!right->map_token) {
2707                         map_extent_buffer(right, (unsigned long)item,
2708                                         sizeof(struct btrfs_item),
2709                                         &right->map_token, &right->kaddr,
2710                                         &right->map_start, &right->map_len,
2711                                         KM_USER1);
2712                 }
2713
2714                 push_space = push_space - btrfs_item_size(right, item);
2715                 btrfs_set_item_offset(right, item, push_space);
2716         }
2717         if (right->map_token) {
2718                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2719                 right->map_token = NULL;
2720         }
2721
2722         btrfs_mark_buffer_dirty(left);
2723         if (right_nritems)
2724                 btrfs_mark_buffer_dirty(right);
2725
2726         ret = btrfs_update_ref(trans, root, right, left,
2727                                old_left_nritems, push_items);
2728         BUG_ON(ret);
2729
2730         btrfs_item_key(right, &disk_key, 0);
2731         wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2732         if (wret)
2733                 ret = wret;
2734
2735         /* then fixup the leaf pointer in the path */
2736         if (path->slots[0] < push_items) {
2737                 path->slots[0] += old_left_nritems;
2738                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2739                         clean_tree_block(trans, root, path->nodes[0]);
2740                 btrfs_tree_unlock(path->nodes[0]);
2741                 free_extent_buffer(path->nodes[0]);
2742                 path->nodes[0] = left;
2743                 path->slots[1] -= 1;
2744         } else {
2745                 btrfs_tree_unlock(left);
2746                 free_extent_buffer(left);
2747                 path->slots[0] -= push_items;
2748         }
2749         BUG_ON(path->slots[0] < 0);
2750         return ret;
2751 out:
2752         btrfs_tree_unlock(left);
2753         free_extent_buffer(left);
2754         return ret;
2755 }
2756
2757 /*
2758  * split the path's leaf in two, making sure there is at least data_size
2759  * available for the resulting leaf level of the path.
2760  *
2761  * returns 0 if all went well and < 0 on failure.
2762  */
2763 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2764                                struct btrfs_root *root,
2765                                struct btrfs_key *ins_key,
2766                                struct btrfs_path *path, int data_size,
2767                                int extend)
2768 {
2769         struct extent_buffer *l;
2770         u32 nritems;
2771         int mid;
2772         int slot;
2773         struct extent_buffer *right;
2774         int data_copy_size;
2775         int rt_data_off;
2776         int i;
2777         int ret = 0;
2778         int wret;
2779         int double_split;
2780         int num_doubles = 0;
2781         struct btrfs_disk_key disk_key;
2782
2783         /* first try to make some room by pushing left and right */
2784         if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2785                 wret = push_leaf_right(trans, root, path, data_size, 0);
2786                 if (wret < 0)
2787                         return wret;
2788                 if (wret) {
2789                         wret = push_leaf_left(trans, root, path, data_size, 0);
2790                         if (wret < 0)
2791                                 return wret;
2792                 }
2793                 l = path->nodes[0];
2794
2795                 /* did the pushes work? */
2796                 if (btrfs_leaf_free_space(root, l) >= data_size)
2797                         return 0;
2798         }
2799
2800         if (!path->nodes[1]) {
2801                 ret = insert_new_root(trans, root, path, 1);
2802                 if (ret)
2803                         return ret;
2804         }
2805 again:
2806         double_split = 0;
2807         l = path->nodes[0];
2808         slot = path->slots[0];
2809         nritems = btrfs_header_nritems(l);
2810         mid = (nritems + 1) / 2;
2811
2812         right = btrfs_alloc_free_block(trans, root, root->leafsize,
2813                                         path->nodes[1]->start,
2814                                         root->root_key.objectid,
2815                                         trans->transid, 0, l->start, 0);
2816         if (IS_ERR(right)) {
2817                 BUG_ON(1);
2818                 return PTR_ERR(right);
2819         }
2820
2821         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2822         btrfs_set_header_bytenr(right, right->start);
2823         btrfs_set_header_generation(right, trans->transid);
2824         btrfs_set_header_owner(right, root->root_key.objectid);
2825         btrfs_set_header_level(right, 0);
2826         write_extent_buffer(right, root->fs_info->fsid,
2827                             (unsigned long)btrfs_header_fsid(right),
2828                             BTRFS_FSID_SIZE);
2829
2830         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832                             BTRFS_UUID_SIZE);
2833         if (mid <= slot) {
2834                 if (nritems == 1 ||
2835                     leaf_space_used(l, mid, nritems - mid) + data_size >
2836                         BTRFS_LEAF_DATA_SIZE(root)) {
2837                         if (slot >= nritems) {
2838                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839                                 btrfs_set_header_nritems(right, 0);
2840                                 wret = insert_ptr(trans, root, path,
2841                                                   &disk_key, right->start,
2842                                                   path->slots[1] + 1, 1);
2843                                 if (wret)
2844                                         ret = wret;
2845
2846                                 btrfs_tree_unlock(path->nodes[0]);
2847                                 free_extent_buffer(path->nodes[0]);
2848                                 path->nodes[0] = right;
2849                                 path->slots[0] = 0;
2850                                 path->slots[1] += 1;
2851                                 btrfs_mark_buffer_dirty(right);
2852                                 return ret;
2853                         }
2854                         mid = slot;
2855                         if (mid != nritems &&
2856                             leaf_space_used(l, mid, nritems - mid) +
2857                             data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2858                                 double_split = 1;
2859                         }
2860                 }
2861         } else {
2862                 if (leaf_space_used(l, 0, mid) + data_size >
2863                         BTRFS_LEAF_DATA_SIZE(root)) {
2864                         if (!extend && data_size && slot == 0) {
2865                                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866                                 btrfs_set_header_nritems(right, 0);
2867                                 wret = insert_ptr(trans, root, path,
2868                                                   &disk_key,
2869                                                   right->start,
2870                                                   path->slots[1], 1);
2871                                 if (wret)
2872                                         ret = wret;
2873                                 btrfs_tree_unlock(path->nodes[0]);
2874                                 free_extent_buffer(path->nodes[0]);
2875                                 path->nodes[0] = right;
2876                                 path->slots[0] = 0;
2877                                 if (path->slots[1] == 0) {
2878                                         wret = fixup_low_keys(trans, root,
2879                                                       path, &disk_key, 1);
2880                                         if (wret)
2881                                                 ret = wret;
2882                                 }
2883                                 btrfs_mark_buffer_dirty(right);
2884                                 return ret;
2885                         } else if ((extend || !data_size) && slot == 0) {
2886                                 mid = 1;
2887                         } else {
2888                                 mid = slot;
2889                                 if (mid != nritems &&
2890                                     leaf_space_used(l, mid, nritems - mid) +
2891                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2892                                         double_split = 1;
2893                                 }
2894                         }
2895                 }
2896         }
2897         nritems = nritems - mid;
2898         btrfs_set_header_nritems(right, nritems);
2899         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900
2901         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2902                            btrfs_item_nr_offset(mid),
2903                            nritems * sizeof(struct btrfs_item));
2904
2905         copy_extent_buffer(right, l,
2906                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907                      data_copy_size, btrfs_leaf_data(l) +
2908                      leaf_data_end(root, l), data_copy_size);
2909
2910         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911                       btrfs_item_end_nr(l, mid);
2912
2913         for (i = 0; i < nritems; i++) {
2914                 struct btrfs_item *item = btrfs_item_nr(right, i);
2915                 u32 ioff;
2916
2917                 if (!right->map_token) {
2918                         map_extent_buffer(right, (unsigned long)item,
2919                                         sizeof(struct btrfs_item),
2920                                         &right->map_token, &right->kaddr,
2921                                         &right->map_start, &right->map_len,
2922                                         KM_USER1);
2923                 }
2924
2925                 ioff = btrfs_item_offset(right, item);
2926                 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927         }
2928
2929         if (right->map_token) {
2930                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931                 right->map_token = NULL;
2932         }
2933
2934         btrfs_set_header_nritems(l, mid);
2935         ret = 0;
2936         btrfs_item_key(right, &disk_key, 0);
2937         wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938                           path->slots[1] + 1, 1);
2939         if (wret)
2940                 ret = wret;
2941
2942         btrfs_mark_buffer_dirty(right);
2943         btrfs_mark_buffer_dirty(l);
2944         BUG_ON(path->slots[0] != slot);
2945
2946         ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2947         BUG_ON(ret);
2948
2949         if (mid <= slot) {
2950                 btrfs_tree_unlock(path->nodes[0]);
2951                 free_extent_buffer(path->nodes[0]);
2952                 path->nodes[0] = right;
2953                 path->slots[0] -= mid;
2954                 path->slots[1] += 1;
2955         } else {
2956                 btrfs_tree_unlock(right);
2957                 free_extent_buffer(right);
2958         }
2959
2960         BUG_ON(path->slots[0] < 0);
2961
2962         if (double_split) {
2963                 BUG_ON(num_doubles != 0);
2964                 num_doubles++;
2965                 goto again;
2966         }
2967         return ret;
2968 }
2969
2970 /*
2971  * This function splits a single item into two items,
2972  * giving 'new_key' to the new item and splitting the
2973  * old one at split_offset (from the start of the item).
2974  *
2975  * The path may be released by this operation.  After
2976  * the split, the path is pointing to the old item.  The
2977  * new item is going to be in the same node as the old one.
2978  *
2979  * Note, the item being split must be smaller enough to live alone on
2980  * a tree block with room for one extra struct btrfs_item
2981  *
2982  * This allows us to split the item in place, keeping a lock on the
2983  * leaf the entire time.
2984  */
2985 int btrfs_split_item(struct btrfs_trans_handle *trans,
2986                      struct btrfs_root *root,
2987                      struct btrfs_path *path,
2988                      struct btrfs_key *new_key,
2989                      unsigned long split_offset)
2990 {
2991         u32 item_size;
2992         struct extent_buffer *leaf;
2993         struct btrfs_key orig_key;
2994         struct btrfs_item *item;
2995         struct btrfs_item *new_item;
2996         int ret = 0;
2997         int slot;
2998         u32 nritems;
2999         u32 orig_offset;
3000         struct btrfs_disk_key disk_key;
3001         char *buf;
3002
3003         leaf = path->nodes[0];
3004         btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
3005         if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
3006                 goto split;
3007
3008         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3009         btrfs_release_path(root, path);
3010
3011         path->search_for_split = 1;
3012         path->keep_locks = 1;
3013
3014         ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
3015         path->search_for_split = 0;
3016
3017         /* if our item isn't there or got smaller, return now */
3018         if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
3019                                                         path->slots[0])) {
3020                 path->keep_locks = 0;
3021                 return -EAGAIN;
3022         }
3023
3024         ret = split_leaf(trans, root, &orig_key, path,
3025                          sizeof(struct btrfs_item), 1);
3026         path->keep_locks = 0;
3027         BUG_ON(ret);
3028
3029         /*
3030          * make sure any changes to the path from split_leaf leave it
3031          * in a blocking state
3032          */
3033         btrfs_set_path_blocking(path);
3034
3035         leaf = path->nodes[0];
3036         BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038 split:
3039         item = btrfs_item_nr(leaf, path->slots[0]);
3040         orig_offset = btrfs_item_offset(leaf, item);
3041         item_size = btrfs_item_size(leaf, item);
3042
3043
3044         buf = kmalloc(item_size, GFP_NOFS);
3045         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046                             path->slots[0]), item_size);
3047         slot = path->slots[0] + 1;
3048         leaf = path->nodes[0];
3049
3050         nritems = btrfs_header_nritems(leaf);
3051
3052         if (slot != nritems) {
3053                 /* shift the items */
3054                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3055                               btrfs_item_nr_offset(slot),
3056                               (nritems - slot) * sizeof(struct btrfs_item));
3057
3058         }
3059
3060         btrfs_cpu_key_to_disk(&disk_key, new_key);
3061         btrfs_set_item_key(leaf, &disk_key, slot);
3062
3063         new_item = btrfs_item_nr(leaf, slot);
3064
3065         btrfs_set_item_offset(leaf, new_item, orig_offset);
3066         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3067
3068         btrfs_set_item_offset(leaf, item,
3069                               orig_offset + item_size - split_offset);
3070         btrfs_set_item_size(leaf, item, split_offset);
3071
3072         btrfs_set_header_nritems(leaf, nritems + 1);
3073
3074         /* write the data for the start of the original item */
3075         write_extent_buffer(leaf, buf,
3076                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3077                             split_offset);