Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
[sfrench/cifs-2.6.git] / fs / btrfs / extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20
21 static struct kmem_cache *extent_state_cache;
22 static struct kmem_cache *extent_buffer_cache;
23
24 static LIST_HEAD(buffers);
25 static LIST_HEAD(states);
26
27 #define LEAK_DEBUG 0
28 #if LEAK_DEBUG
29 static DEFINE_SPINLOCK(leak_lock);
30 #endif
31
32 #define BUFFER_LRU_MAX 64
33
34 struct tree_entry {
35         u64 start;
36         u64 end;
37         struct rb_node rb_node;
38 };
39
40 struct extent_page_data {
41         struct bio *bio;
42         struct extent_io_tree *tree;
43         get_extent_t *get_extent;
44
45         /* tells writepage not to lock the state bits for this range
46          * it still does the unlocking
47          */
48         unsigned int extent_locked:1;
49
50         /* tells the submit_bio code to use a WRITE_SYNC */
51         unsigned int sync_io:1;
52 };
53
54 int __init extent_io_init(void)
55 {
56         extent_state_cache = kmem_cache_create("extent_state",
57                         sizeof(struct extent_state), 0,
58                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
59         if (!extent_state_cache)
60                 return -ENOMEM;
61
62         extent_buffer_cache = kmem_cache_create("extent_buffers",
63                         sizeof(struct extent_buffer), 0,
64                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
65         if (!extent_buffer_cache)
66                 goto free_state_cache;
67         return 0;
68
69 free_state_cache:
70         kmem_cache_destroy(extent_state_cache);
71         return -ENOMEM;
72 }
73
74 void extent_io_exit(void)
75 {
76         struct extent_state *state;
77         struct extent_buffer *eb;
78
79         while (!list_empty(&states)) {
80                 state = list_entry(states.next, struct extent_state, leak_list);
81                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
82                        "state %lu in tree %p refs %d\n",
83                        (unsigned long long)state->start,
84                        (unsigned long long)state->end,
85                        state->state, state->tree, atomic_read(&state->refs));
86                 list_del(&state->leak_list);
87                 kmem_cache_free(extent_state_cache, state);
88
89         }
90
91         while (!list_empty(&buffers)) {
92                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
93                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
94                        "refs %d\n", (unsigned long long)eb->start,
95                        eb->len, atomic_read(&eb->refs));
96                 list_del(&eb->leak_list);
97                 kmem_cache_free(extent_buffer_cache, eb);
98         }
99         if (extent_state_cache)
100                 kmem_cache_destroy(extent_state_cache);
101         if (extent_buffer_cache)
102                 kmem_cache_destroy(extent_buffer_cache);
103 }
104
105 void extent_io_tree_init(struct extent_io_tree *tree,
106                          struct address_space *mapping)
107 {
108         tree->state = RB_ROOT;
109         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
110         tree->ops = NULL;
111         tree->dirty_bytes = 0;
112         spin_lock_init(&tree->lock);
113         spin_lock_init(&tree->buffer_lock);
114         tree->mapping = mapping;
115 }
116
117 static struct extent_state *alloc_extent_state(gfp_t mask)
118 {
119         struct extent_state *state;
120 #if LEAK_DEBUG
121         unsigned long flags;
122 #endif
123
124         state = kmem_cache_alloc(extent_state_cache, mask);
125         if (!state)
126                 return state;
127         state->state = 0;
128         state->private = 0;
129         state->tree = NULL;
130 #if LEAK_DEBUG
131         spin_lock_irqsave(&leak_lock, flags);
132         list_add(&state->leak_list, &states);
133         spin_unlock_irqrestore(&leak_lock, flags);
134 #endif
135         atomic_set(&state->refs, 1);
136         init_waitqueue_head(&state->wq);
137         return state;
138 }
139
140 void free_extent_state(struct extent_state *state)
141 {
142         if (!state)
143                 return;
144         if (atomic_dec_and_test(&state->refs)) {
145 #if LEAK_DEBUG
146                 unsigned long flags;
147 #endif
148                 WARN_ON(state->tree);
149 #if LEAK_DEBUG
150                 spin_lock_irqsave(&leak_lock, flags);
151                 list_del(&state->leak_list);
152                 spin_unlock_irqrestore(&leak_lock, flags);
153 #endif
154                 kmem_cache_free(extent_state_cache, state);
155         }
156 }
157
158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
159                                    struct rb_node *node)
160 {
161         struct rb_node **p = &root->rb_node;
162         struct rb_node *parent = NULL;
163         struct tree_entry *entry;
164
165         while (*p) {
166                 parent = *p;
167                 entry = rb_entry(parent, struct tree_entry, rb_node);
168
169                 if (offset < entry->start)
170                         p = &(*p)->rb_left;
171                 else if (offset > entry->end)
172                         p = &(*p)->rb_right;
173                 else
174                         return parent;
175         }
176
177         entry = rb_entry(node, struct tree_entry, rb_node);
178         rb_link_node(node, parent, p);
179         rb_insert_color(node, root);
180         return NULL;
181 }
182
183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
184                                      struct rb_node **prev_ret,
185                                      struct rb_node **next_ret)
186 {
187         struct rb_root *root = &tree->state;
188         struct rb_node *n = root->rb_node;
189         struct rb_node *prev = NULL;
190         struct rb_node *orig_prev = NULL;
191         struct tree_entry *entry;
192         struct tree_entry *prev_entry = NULL;
193
194         while (n) {
195                 entry = rb_entry(n, struct tree_entry, rb_node);
196                 prev = n;
197                 prev_entry = entry;
198
199                 if (offset < entry->start)
200                         n = n->rb_left;
201                 else if (offset > entry->end)
202                         n = n->rb_right;
203                 else
204                         return n;
205         }
206
207         if (prev_ret) {
208                 orig_prev = prev;
209                 while (prev && offset > prev_entry->end) {
210                         prev = rb_next(prev);
211                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
212                 }
213                 *prev_ret = prev;
214                 prev = orig_prev;
215         }
216
217         if (next_ret) {
218                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
219                 while (prev && offset < prev_entry->start) {
220                         prev = rb_prev(prev);
221                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222                 }
223                 *next_ret = prev;
224         }
225         return NULL;
226 }
227
228 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
229                                           u64 offset)
230 {
231         struct rb_node *prev = NULL;
232         struct rb_node *ret;
233
234         ret = __etree_search(tree, offset, &prev, NULL);
235         if (!ret)
236                 return prev;
237         return ret;
238 }
239
240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
241                      struct extent_state *other)
242 {
243         if (tree->ops && tree->ops->merge_extent_hook)
244                 tree->ops->merge_extent_hook(tree->mapping->host, new,
245                                              other);
246 }
247
248 /*
249  * utility function to look for merge candidates inside a given range.
250  * Any extents with matching state are merged together into a single
251  * extent in the tree.  Extents with EXTENT_IO in their state field
252  * are not merged because the end_io handlers need to be able to do
253  * operations on them without sleeping (or doing allocations/splits).
254  *
255  * This should be called with the tree lock held.
256  */
257 static int merge_state(struct extent_io_tree *tree,
258                        struct extent_state *state)
259 {
260         struct extent_state *other;
261         struct rb_node *other_node;
262
263         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264                 return 0;
265
266         other_node = rb_prev(&state->rb_node);
267         if (other_node) {
268                 other = rb_entry(other_node, struct extent_state, rb_node);
269                 if (other->end == state->start - 1 &&
270                     other->state == state->state) {
271                         merge_cb(tree, state, other);
272                         state->start = other->start;
273                         other->tree = NULL;
274                         rb_erase(&other->rb_node, &tree->state);
275                         free_extent_state(other);
276                 }
277         }
278         other_node = rb_next(&state->rb_node);
279         if (other_node) {
280                 other = rb_entry(other_node, struct extent_state, rb_node);
281                 if (other->start == state->end + 1 &&
282                     other->state == state->state) {
283                         merge_cb(tree, state, other);
284                         other->start = state->start;
285                         state->tree = NULL;
286                         rb_erase(&state->rb_node, &tree->state);
287                         free_extent_state(state);
288                         state = NULL;
289                 }
290         }
291
292         return 0;
293 }
294
295 static int set_state_cb(struct extent_io_tree *tree,
296                          struct extent_state *state, int *bits)
297 {
298         if (tree->ops && tree->ops->set_bit_hook) {
299                 return tree->ops->set_bit_hook(tree->mapping->host,
300                                                state, bits);
301         }
302
303         return 0;
304 }
305
306 static void clear_state_cb(struct extent_io_tree *tree,
307                            struct extent_state *state, int *bits)
308 {
309         if (tree->ops && tree->ops->clear_bit_hook)
310                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311 }
312
313 /*
314  * insert an extent_state struct into the tree.  'bits' are set on the
315  * struct before it is inserted.
316  *
317  * This may return -EEXIST if the extent is already there, in which case the
318  * state struct is freed.
319  *
320  * The tree lock is not taken internally.  This is a utility function and
321  * probably isn't what you want to call (see set/clear_extent_bit).
322  */
323 static int insert_state(struct extent_io_tree *tree,
324                         struct extent_state *state, u64 start, u64 end,
325                         int *bits)
326 {
327         struct rb_node *node;
328         int bits_to_set = *bits & ~EXTENT_CTLBITS;
329         int ret;
330
331         if (end < start) {
332                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
333                        (unsigned long long)end,
334                        (unsigned long long)start);
335                 WARN_ON(1);
336         }
337         state->start = start;
338         state->end = end;
339         ret = set_state_cb(tree, state, bits);
340         if (ret)
341                 return ret;
342
343         if (bits_to_set & EXTENT_DIRTY)
344                 tree->dirty_bytes += end - start + 1;
345         state->state |= bits_to_set;
346         node = tree_insert(&tree->state, end, &state->rb_node);
347         if (node) {
348                 struct extent_state *found;
349                 found = rb_entry(node, struct extent_state, rb_node);
350                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
351                        "%llu %llu\n", (unsigned long long)found->start,
352                        (unsigned long long)found->end,
353                        (unsigned long long)start, (unsigned long long)end);
354                 free_extent_state(state);
355                 return -EEXIST;
356         }
357         state->tree = tree;
358         merge_state(tree, state);
359         return 0;
360 }
361
362 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363                      u64 split)
364 {
365         if (tree->ops && tree->ops->split_extent_hook)
366                 return tree->ops->split_extent_hook(tree->mapping->host,
367                                                     orig, split);
368         return 0;
369 }
370
371 /*
372  * split a given extent state struct in two, inserting the preallocated
373  * struct 'prealloc' as the newly created second half.  'split' indicates an
374  * offset inside 'orig' where it should be split.
375  *
376  * Before calling,
377  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
378  * are two extent state structs in the tree:
379  * prealloc: [orig->start, split - 1]
380  * orig: [ split, orig->end ]
381  *
382  * The tree locks are not taken by this function. They need to be held
383  * by the caller.
384  */
385 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
386                        struct extent_state *prealloc, u64 split)
387 {
388         struct rb_node *node;
389
390         split_cb(tree, orig, split);
391
392         prealloc->start = orig->start;
393         prealloc->end = split - 1;
394         prealloc->state = orig->state;
395         orig->start = split;
396
397         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
398         if (node) {
399                 free_extent_state(prealloc);
400                 return -EEXIST;
401         }
402         prealloc->tree = tree;
403         return 0;
404 }
405
406 /*
407  * utility function to clear some bits in an extent state struct.
408  * it will optionally wake up any one waiting on this state (wake == 1), or
409  * forcibly remove the state from the tree (delete == 1).
410  *
411  * If no bits are set on the state struct after clearing things, the
412  * struct is freed and removed from the tree
413  */
414 static int clear_state_bit(struct extent_io_tree *tree,
415                             struct extent_state *state,
416                             int *bits, int wake)
417 {
418         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
419         int ret = state->state & bits_to_clear;
420
421         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
422                 u64 range = state->end - state->start + 1;
423                 WARN_ON(range > tree->dirty_bytes);
424                 tree->dirty_bytes -= range;
425         }
426         clear_state_cb(tree, state, bits);
427         state->state &= ~bits_to_clear;
428         if (wake)
429                 wake_up(&state->wq);
430         if (state->state == 0) {
431                 if (state->tree) {
432                         rb_erase(&state->rb_node, &tree->state);
433                         state->tree = NULL;
434                         free_extent_state(state);
435                 } else {
436                         WARN_ON(1);
437                 }
438         } else {
439                 merge_state(tree, state);
440         }
441         return ret;
442 }
443
444 static struct extent_state *
445 alloc_extent_state_atomic(struct extent_state *prealloc)
446 {
447         if (!prealloc)
448                 prealloc = alloc_extent_state(GFP_ATOMIC);
449
450         return prealloc;
451 }
452
453 /*
454  * clear some bits on a range in the tree.  This may require splitting
455  * or inserting elements in the tree, so the gfp mask is used to
456  * indicate which allocations or sleeping are allowed.
457  *
458  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
459  * the given range from the tree regardless of state (ie for truncate).
460  *
461  * the range [start, end] is inclusive.
462  *
463  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
464  * bits were already set, or zero if none of the bits were already set.
465  */
466 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
467                      int bits, int wake, int delete,
468                      struct extent_state **cached_state,
469                      gfp_t mask)
470 {
471         struct extent_state *state;
472         struct extent_state *cached;
473         struct extent_state *prealloc = NULL;
474         struct rb_node *next_node;
475         struct rb_node *node;
476         u64 last_end;
477         int err;
478         int set = 0;
479         int clear = 0;
480
481         if (delete)
482                 bits |= ~EXTENT_CTLBITS;
483         bits |= EXTENT_FIRST_DELALLOC;
484
485         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
486                 clear = 1;
487 again:
488         if (!prealloc && (mask & __GFP_WAIT)) {
489                 prealloc = alloc_extent_state(mask);
490                 if (!prealloc)
491                         return -ENOMEM;
492         }
493
494         spin_lock(&tree->lock);
495         if (cached_state) {
496                 cached = *cached_state;
497
498                 if (clear) {
499                         *cached_state = NULL;
500                         cached_state = NULL;
501                 }
502
503                 if (cached && cached->tree && cached->start == start) {
504                         if (clear)
505                                 atomic_dec(&cached->refs);
506                         state = cached;
507                         goto hit_next;
508                 }
509                 if (clear)
510                         free_extent_state(cached);
511         }
512         /*
513          * this search will find the extents that end after
514          * our range starts
515          */
516         node = tree_search(tree, start);
517         if (!node)
518                 goto out;
519         state = rb_entry(node, struct extent_state, rb_node);
520 hit_next:
521         if (state->start > end)
522                 goto out;
523         WARN_ON(state->end < start);
524         last_end = state->end;
525
526         /*
527          *     | ---- desired range ---- |
528          *  | state | or
529          *  | ------------- state -------------- |
530          *
531          * We need to split the extent we found, and may flip
532          * bits on second half.
533          *
534          * If the extent we found extends past our range, we
535          * just split and search again.  It'll get split again
536          * the next time though.
537          *
538          * If the extent we found is inside our range, we clear
539          * the desired bit on it.
540          */
541
542         if (state->start < start) {
543                 prealloc = alloc_extent_state_atomic(prealloc);
544                 BUG_ON(!prealloc);
545                 err = split_state(tree, state, prealloc, start);
546                 BUG_ON(err == -EEXIST);
547                 prealloc = NULL;
548                 if (err)
549                         goto out;
550                 if (state->end <= end) {
551                         set |= clear_state_bit(tree, state, &bits, wake);
552                         if (last_end == (u64)-1)
553                                 goto out;
554                         start = last_end + 1;
555                 }
556                 goto search_again;
557         }
558         /*
559          * | ---- desired range ---- |
560          *                        | state |
561          * We need to split the extent, and clear the bit
562          * on the first half
563          */
564         if (state->start <= end && state->end > end) {
565                 prealloc = alloc_extent_state_atomic(prealloc);
566                 BUG_ON(!prealloc);
567                 err = split_state(tree, state, prealloc, end + 1);
568                 BUG_ON(err == -EEXIST);
569                 if (wake)
570                         wake_up(&state->wq);
571
572                 set |= clear_state_bit(tree, prealloc, &bits, wake);
573
574                 prealloc = NULL;
575                 goto out;
576         }
577
578         if (state->end < end && prealloc && !need_resched())
579                 next_node = rb_next(&state->rb_node);
580         else
581                 next_node = NULL;
582
583         set |= clear_state_bit(tree, state, &bits, wake);
584         if (last_end == (u64)-1)
585                 goto out;
586         start = last_end + 1;
587         if (start <= end && next_node) {
588                 state = rb_entry(next_node, struct extent_state,
589                                  rb_node);
590                 if (state->start == start)
591                         goto hit_next;
592         }
593         goto search_again;
594
595 out:
596         spin_unlock(&tree->lock);
597         if (prealloc)
598                 free_extent_state(prealloc);
599
600         return set;
601
602 search_again:
603         if (start > end)
604                 goto out;
605         spin_unlock(&tree->lock);
606         if (mask & __GFP_WAIT)
607                 cond_resched();
608         goto again;
609 }
610
611 static int wait_on_state(struct extent_io_tree *tree,
612                          struct extent_state *state)
613                 __releases(tree->lock)
614                 __acquires(tree->lock)
615 {
616         DEFINE_WAIT(wait);
617         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
618         spin_unlock(&tree->lock);
619         schedule();
620         spin_lock(&tree->lock);
621         finish_wait(&state->wq, &wait);
622         return 0;
623 }
624
625 /*
626  * waits for one or more bits to clear on a range in the state tree.
627  * The range [start, end] is inclusive.
628  * The tree lock is taken by this function
629  */
630 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
631 {
632         struct extent_state *state;
633         struct rb_node *node;
634
635         spin_lock(&tree->lock);
636 again:
637         while (1) {
638                 /*
639                  * this search will find all the extents that end after
640                  * our range starts
641                  */
642                 node = tree_search(tree, start);
643                 if (!node)
644                         break;
645
646                 state = rb_entry(node, struct extent_state, rb_node);
647
648                 if (state->start > end)
649                         goto out;
650
651                 if (state->state & bits) {
652                         start = state->start;
653                         atomic_inc(&state->refs);
654                         wait_on_state(tree, state);
655                         free_extent_state(state);
656                         goto again;
657                 }
658                 start = state->end + 1;
659
660                 if (start > end)
661                         break;
662
663                 if (need_resched()) {
664                         spin_unlock(&tree->lock);
665                         cond_resched();
666                         spin_lock(&tree->lock);
667                 }
668         }
669 out:
670         spin_unlock(&tree->lock);
671         return 0;
672 }
673
674 static int set_state_bits(struct extent_io_tree *tree,
675                            struct extent_state *state,
676                            int *bits)
677 {
678         int ret;
679         int bits_to_set = *bits & ~EXTENT_CTLBITS;
680
681         ret = set_state_cb(tree, state, bits);
682         if (ret)
683                 return ret;
684         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685                 u64 range = state->end - state->start + 1;
686                 tree->dirty_bytes += range;
687         }
688         state->state |= bits_to_set;
689
690         return 0;
691 }
692
693 static void cache_state(struct extent_state *state,
694                         struct extent_state **cached_ptr)
695 {
696         if (cached_ptr && !(*cached_ptr)) {
697                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
698                         *cached_ptr = state;
699                         atomic_inc(&state->refs);
700                 }
701         }
702 }
703
704 static void uncache_state(struct extent_state **cached_ptr)
705 {
706         if (cached_ptr && (*cached_ptr)) {
707                 struct extent_state *state = *cached_ptr;
708                 *cached_ptr = NULL;
709                 free_extent_state(state);
710         }
711 }
712
713 /*
714  * set some bits on a range in the tree.  This may require allocations or
715  * sleeping, so the gfp mask is used to indicate what is allowed.
716  *
717  * If any of the exclusive bits are set, this will fail with -EEXIST if some
718  * part of the range already has the desired bits set.  The start of the
719  * existing range is returned in failed_start in this case.
720  *
721  * [start, end] is inclusive This takes the tree lock.
722  */
723
724 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
725                    int bits, int exclusive_bits, u64 *failed_start,
726                    struct extent_state **cached_state, gfp_t mask)
727 {
728         struct extent_state *state;
729         struct extent_state *prealloc = NULL;
730         struct rb_node *node;
731         int err = 0;
732         u64 last_start;
733         u64 last_end;
734
735         bits |= EXTENT_FIRST_DELALLOC;
736 again:
737         if (!prealloc && (mask & __GFP_WAIT)) {
738                 prealloc = alloc_extent_state(mask);
739                 BUG_ON(!prealloc);
740         }
741
742         spin_lock(&tree->lock);
743         if (cached_state && *cached_state) {
744                 state = *cached_state;
745                 if (state->start == start && state->tree) {
746                         node = &state->rb_node;
747                         goto hit_next;
748                 }
749         }
750         /*
751          * this search will find all the extents that end after
752          * our range starts.
753          */
754         node = tree_search(tree, start);
755         if (!node) {
756                 prealloc = alloc_extent_state_atomic(prealloc);
757                 BUG_ON(!prealloc);
758                 err = insert_state(tree, prealloc, start, end, &bits);
759                 prealloc = NULL;
760                 BUG_ON(err == -EEXIST);
761                 goto out;
762         }
763         state = rb_entry(node, struct extent_state, rb_node);
764 hit_next:
765         last_start = state->start;
766         last_end = state->end;
767
768         /*
769          * | ---- desired range ---- |
770          * | state |
771          *
772          * Just lock what we found and keep going
773          */
774         if (state->start == start && state->end <= end) {
775                 struct rb_node *next_node;
776                 if (state->state & exclusive_bits) {
777                         *failed_start = state->start;
778                         err = -EEXIST;
779                         goto out;
780                 }
781
782                 err = set_state_bits(tree, state, &bits);
783                 if (err)
784                         goto out;
785
786                 next_node = rb_next(node);
787                 cache_state(state, cached_state);
788                 merge_state(tree, state);
789                 if (last_end == (u64)-1)
790                         goto out;
791
792                 start = last_end + 1;
793                 if (next_node && start < end && prealloc && !need_resched()) {
794                         state = rb_entry(next_node, struct extent_state,
795                                          rb_node);
796                         if (state->start == start)
797                                 goto hit_next;
798                 }
799                 goto search_again;
800         }
801
802         /*
803          *     | ---- desired range ---- |
804          * | state |
805          *   or
806          * | ------------- state -------------- |
807          *
808          * We need to split the extent we found, and may flip bits on
809          * second half.
810          *
811          * If the extent we found extends past our
812          * range, we just split and search again.  It'll get split
813          * again the next time though.
814          *
815          * If the extent we found is inside our range, we set the
816          * desired bit on it.
817          */
818         if (state->start < start) {
819                 if (state->state & exclusive_bits) {
820                         *failed_start = start;
821                         err = -EEXIST;
822                         goto out;
823                 }
824
825                 prealloc = alloc_extent_state_atomic(prealloc);
826                 BUG_ON(!prealloc);
827                 err = split_state(tree, state, prealloc, start);
828                 BUG_ON(err == -EEXIST);
829                 prealloc = NULL;
830                 if (err)
831                         goto out;
832                 if (state->end <= end) {
833                         err = set_state_bits(tree, state, &bits);
834                         if (err)
835                                 goto out;
836                         cache_state(state, cached_state);
837                         merge_state(tree, state);
838                         if (last_end == (u64)-1)
839                                 goto out;
840                         start = last_end + 1;
841                 }
842                 goto search_again;
843         }
844         /*
845          * | ---- desired range ---- |
846          *     | state | or               | state |
847          *
848          * There's a hole, we need to insert something in it and
849          * ignore the extent we found.
850          */
851         if (state->start > start) {
852                 u64 this_end;
853                 if (end < last_start)
854                         this_end = end;
855                 else
856                         this_end = last_start - 1;
857
858                 prealloc = alloc_extent_state_atomic(prealloc);
859                 BUG_ON(!prealloc);
860
861                 /*
862                  * Avoid to free 'prealloc' if it can be merged with
863                  * the later extent.
864                  */
865                 atomic_inc(&prealloc->refs);
866                 err = insert_state(tree, prealloc, start, this_end,
867                                    &bits);
868                 BUG_ON(err == -EEXIST);
869                 if (err) {
870                         free_extent_state(prealloc);
871                         prealloc = NULL;
872                         goto out;
873                 }
874                 cache_state(prealloc, cached_state);
875                 free_extent_state(prealloc);
876                 prealloc = NULL;
877                 start = this_end + 1;
878                 goto search_again;
879         }
880         /*
881          * | ---- desired range ---- |
882          *                        | state |
883          * We need to split the extent, and set the bit
884          * on the first half
885          */
886         if (state->start <= end && state->end > end) {
887                 if (state->state & exclusive_bits) {
888                         *failed_start = start;
889                         err = -EEXIST;
890                         goto out;
891                 }
892
893                 prealloc = alloc_extent_state_atomic(prealloc);
894                 BUG_ON(!prealloc);
895                 err = split_state(tree, state, prealloc, end + 1);
896                 BUG_ON(err == -EEXIST);
897
898                 err = set_state_bits(tree, prealloc, &bits);
899                 if (err) {
900                         prealloc = NULL;
901                         goto out;
902                 }
903                 cache_state(prealloc, cached_state);
904                 merge_state(tree, prealloc);
905                 prealloc = NULL;
906                 goto out;
907         }
908
909         goto search_again;
910
911 out:
912         spin_unlock(&tree->lock);
913         if (prealloc)
914                 free_extent_state(prealloc);
915
916         return err;
917
918 search_again:
919         if (start > end)
920                 goto out;
921         spin_unlock(&tree->lock);
922         if (mask & __GFP_WAIT)
923                 cond_resched();
924         goto again;
925 }
926
927 /* wrappers around set/clear extent bit */
928 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
929                      gfp_t mask)
930 {
931         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
932                               NULL, mask);
933 }
934
935 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
936                     int bits, gfp_t mask)
937 {
938         return set_extent_bit(tree, start, end, bits, 0, NULL,
939                               NULL, mask);
940 }
941
942 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
943                       int bits, gfp_t mask)
944 {
945         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
946 }
947
948 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949                         struct extent_state **cached_state, gfp_t mask)
950 {
951         return set_extent_bit(tree, start, end,
952                               EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953                               0, NULL, cached_state, mask);
954 }
955
956 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
957                        gfp_t mask)
958 {
959         return clear_extent_bit(tree, start, end,
960                                 EXTENT_DIRTY | EXTENT_DELALLOC |
961                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
962 }
963
964 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
965                      gfp_t mask)
966 {
967         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
968                               NULL, mask);
969 }
970
971 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
972                         struct extent_state **cached_state, gfp_t mask)
973 {
974         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
975                               NULL, cached_state, mask);
976 }
977
978 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
979                                  u64 end, struct extent_state **cached_state,
980                                  gfp_t mask)
981 {
982         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
983                                 cached_state, mask);
984 }
985
986 /*
987  * either insert or lock state struct between start and end use mask to tell
988  * us if waiting is desired.
989  */
990 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
991                      int bits, struct extent_state **cached_state, gfp_t mask)
992 {
993         int err;
994         u64 failed_start;
995         while (1) {
996                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
997                                      EXTENT_LOCKED, &failed_start,
998                                      cached_state, mask);
999                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
1000                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1001                         start = failed_start;
1002                 } else {
1003                         break;
1004                 }
1005                 WARN_ON(start > end);
1006         }
1007         return err;
1008 }
1009
1010 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1011 {
1012         return lock_extent_bits(tree, start, end, 0, NULL, mask);
1013 }
1014
1015 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1016                     gfp_t mask)
1017 {
1018         int err;
1019         u64 failed_start;
1020
1021         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1022                              &failed_start, NULL, mask);
1023         if (err == -EEXIST) {
1024                 if (failed_start > start)
1025                         clear_extent_bit(tree, start, failed_start - 1,
1026                                          EXTENT_LOCKED, 1, 0, NULL, mask);
1027                 return 0;
1028         }
1029         return 1;
1030 }
1031
1032 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1033                          struct extent_state **cached, gfp_t mask)
1034 {
1035         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1036                                 mask);
1037 }
1038
1039 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1040 {
1041         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1042                                 mask);
1043 }
1044
1045 /*
1046  * helper function to set both pages and extents in the tree writeback
1047  */
1048 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1049 {
1050         unsigned long index = start >> PAGE_CACHE_SHIFT;
1051         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1052         struct page *page;
1053
1054         while (index <= end_index) {
1055                 page = find_get_page(tree->mapping, index);
1056                 BUG_ON(!page);
1057                 set_page_writeback(page);
1058                 page_cache_release(page);
1059                 index++;
1060         }
1061         return 0;
1062 }
1063
1064 /*
1065  * find the first offset in the io tree with 'bits' set. zero is
1066  * returned if we find something, and *start_ret and *end_ret are
1067  * set to reflect the state struct that was found.
1068  *
1069  * If nothing was found, 1 is returned, < 0 on error
1070  */
1071 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1072                           u64 *start_ret, u64 *end_ret, int bits)
1073 {
1074         struct rb_node *node;
1075         struct extent_state *state;
1076         int ret = 1;
1077
1078         spin_lock(&tree->lock);
1079         /*
1080          * this search will find all the extents that end after
1081          * our range starts.
1082          */
1083         node = tree_search(tree, start);
1084         if (!node)
1085                 goto out;
1086
1087         while (1) {
1088                 state = rb_entry(node, struct extent_state, rb_node);
1089                 if (state->end >= start && (state->state & bits)) {
1090                         *start_ret = state->start;
1091                         *end_ret = state->end;
1092                         ret = 0;
1093                         break;
1094                 }
1095                 node = rb_next(node);
1096                 if (!node)
1097                         break;
1098         }
1099 out:
1100         spin_unlock(&tree->lock);
1101         return ret;
1102 }
1103
1104 /* find the first state struct with 'bits' set after 'start', and
1105  * return it.  tree->lock must be held.  NULL will returned if
1106  * nothing was found after 'start'
1107  */
1108 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1109                                                  u64 start, int bits)
1110 {
1111         struct rb_node *node;
1112         struct extent_state *state;
1113
1114         /*
1115          * this search will find all the extents that end after
1116          * our range starts.
1117          */
1118         node = tree_search(tree, start);
1119         if (!node)
1120                 goto out;
1121
1122         while (1) {
1123                 state = rb_entry(node, struct extent_state, rb_node);
1124                 if (state->end >= start && (state->state & bits))
1125                         return state;
1126
1127                 node = rb_next(node);
1128                 if (!node)
1129                         break;
1130         }
1131 out:
1132         return NULL;
1133 }
1134
1135 /*
1136  * find a contiguous range of bytes in the file marked as delalloc, not
1137  * more than 'max_bytes'.  start and end are used to return the range,
1138  *
1139  * 1 is returned if we find something, 0 if nothing was in the tree
1140  */
1141 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1142                                         u64 *start, u64 *end, u64 max_bytes,
1143                                         struct extent_state **cached_state)
1144 {
1145         struct rb_node *node;
1146         struct extent_state *state;
1147         u64 cur_start = *start;
1148         u64 found = 0;
1149         u64 total_bytes = 0;
1150
1151         spin_lock(&tree->lock);
1152
1153         /*
1154          * this search will find all the extents that end after
1155          * our range starts.
1156          */
1157         node = tree_search(tree, cur_start);
1158         if (!node) {
1159                 if (!found)
1160                         *end = (u64)-1;
1161                 goto out;
1162         }
1163
1164         while (1) {
1165                 state = rb_entry(node, struct extent_state, rb_node);
1166                 if (found && (state->start != cur_start ||
1167                               (state->state & EXTENT_BOUNDARY))) {
1168                         goto out;
1169                 }
1170                 if (!(state->state & EXTENT_DELALLOC)) {
1171                         if (!found)
1172                                 *end = state->end;
1173                         goto out;
1174                 }
1175                 if (!found) {
1176                         *start = state->start;
1177                         *cached_state = state;
1178                         atomic_inc(&state->refs);
1179                 }
1180                 found++;
1181                 *end = state->end;
1182                 cur_start = state->end + 1;
1183                 node = rb_next(node);
1184                 if (!node)
1185                         break;
1186                 total_bytes += state->end - state->start + 1;
1187                 if (total_bytes >= max_bytes)
1188                         break;
1189         }
1190 out:
1191         spin_unlock(&tree->lock);
1192         return found;
1193 }
1194
1195 static noinline int __unlock_for_delalloc(struct inode *inode,
1196                                           struct page *locked_page,
1197                                           u64 start, u64 end)
1198 {
1199         int ret;
1200         struct page *pages[16];
1201         unsigned long index = start >> PAGE_CACHE_SHIFT;
1202         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1203         unsigned long nr_pages = end_index - index + 1;
1204         int i;
1205
1206         if (index == locked_page->index && end_index == index)
1207                 return 0;
1208
1209         while (nr_pages > 0) {
1210                 ret = find_get_pages_contig(inode->i_mapping, index,
1211                                      min_t(unsigned long, nr_pages,
1212                                      ARRAY_SIZE(pages)), pages);
1213                 for (i = 0; i < ret; i++) {
1214                         if (pages[i] != locked_page)
1215                                 unlock_page(pages[i]);
1216                         page_cache_release(pages[i]);
1217                 }
1218                 nr_pages -= ret;
1219                 index += ret;
1220                 cond_resched();
1221         }
1222         return 0;
1223 }
1224
1225 static noinline int lock_delalloc_pages(struct inode *inode,
1226                                         struct page *locked_page,
1227                                         u64 delalloc_start,
1228                                         u64 delalloc_end)
1229 {
1230         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1231         unsigned long start_index = index;
1232         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1233         unsigned long pages_locked = 0;
1234         struct page *pages[16];
1235         unsigned long nrpages;
1236         int ret;
1237         int i;
1238
1239         /* the caller is responsible for locking the start index */
1240         if (index == locked_page->index && index == end_index)
1241                 return 0;
1242
1243         /* skip the page at the start index */
1244         nrpages = end_index - index + 1;
1245         while (nrpages > 0) {
1246                 ret = find_get_pages_contig(inode->i_mapping, index,
1247                                      min_t(unsigned long,
1248                                      nrpages, ARRAY_SIZE(pages)), pages);
1249                 if (ret == 0) {
1250                         ret = -EAGAIN;
1251                         goto done;
1252                 }
1253                 /* now we have an array of pages, lock them all */
1254                 for (i = 0; i < ret; i++) {
1255                         /*
1256                          * the caller is taking responsibility for
1257                          * locked_page
1258                          */
1259                         if (pages[i] != locked_page) {
1260                                 lock_page(pages[i]);
1261                                 if (!PageDirty(pages[i]) ||
1262                                     pages[i]->mapping != inode->i_mapping) {
1263                                         ret = -EAGAIN;
1264                                         unlock_page(pages[i]);
1265                                         page_cache_release(pages[i]);
1266                                         goto done;
1267                                 }
1268                         }
1269                         page_cache_release(pages[i]);
1270                         pages_locked++;
1271                 }
1272                 nrpages -= ret;
1273                 index += ret;
1274                 cond_resched();
1275         }
1276         ret = 0;
1277 done:
1278         if (ret && pages_locked) {
1279                 __unlock_for_delalloc(inode, locked_page,
1280                               delalloc_start,
1281                               ((u64)(start_index + pages_locked - 1)) <<
1282                               PAGE_CACHE_SHIFT);
1283         }
1284         return ret;
1285 }
1286
1287 /*
1288  * find a contiguous range of bytes in the file marked as delalloc, not
1289  * more than 'max_bytes'.  start and end are used to return the range,
1290  *
1291  * 1 is returned if we find something, 0 if nothing was in the tree
1292  */
1293 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1294                                              struct extent_io_tree *tree,
1295                                              struct page *locked_page,
1296                                              u64 *start, u64 *end,
1297                                              u64 max_bytes)
1298 {
1299         u64 delalloc_start;
1300         u64 delalloc_end;
1301         u64 found;
1302         struct extent_state *cached_state = NULL;
1303         int ret;
1304         int loops = 0;
1305
1306 again:
1307         /* step one, find a bunch of delalloc bytes starting at start */
1308         delalloc_start = *start;
1309         delalloc_end = 0;
1310         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1311                                     max_bytes, &cached_state);
1312         if (!found || delalloc_end <= *start) {
1313                 *start = delalloc_start;
1314                 *end = delalloc_end;
1315                 free_extent_state(cached_state);
1316                 return found;
1317         }
1318
1319         /*
1320          * start comes from the offset of locked_page.  We have to lock
1321          * pages in order, so we can't process delalloc bytes before
1322          * locked_page
1323          */
1324         if (delalloc_start < *start)
1325                 delalloc_start = *start;
1326
1327         /*
1328          * make sure to limit the number of pages we try to lock down
1329          * if we're looping.
1330          */
1331         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1332                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1333
1334         /* step two, lock all the pages after the page that has start */
1335         ret = lock_delalloc_pages(inode, locked_page,
1336                                   delalloc_start, delalloc_end);
1337         if (ret == -EAGAIN) {
1338                 /* some of the pages are gone, lets avoid looping by
1339                  * shortening the size of the delalloc range we're searching
1340                  */
1341                 free_extent_state(cached_state);
1342                 if (!loops) {
1343                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1344                         max_bytes = PAGE_CACHE_SIZE - offset;
1345                         loops = 1;
1346                         goto again;
1347                 } else {
1348                         found = 0;
1349                         goto out_failed;
1350                 }
1351         }
1352         BUG_ON(ret);
1353
1354         /* step three, lock the state bits for the whole range */
1355         lock_extent_bits(tree, delalloc_start, delalloc_end,
1356                          0, &cached_state, GFP_NOFS);
1357
1358         /* then test to make sure it is all still delalloc */
1359         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1360                              EXTENT_DELALLOC, 1, cached_state);
1361         if (!ret) {
1362                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1363                                      &cached_state, GFP_NOFS);
1364                 __unlock_for_delalloc(inode, locked_page,
1365                               delalloc_start, delalloc_end);
1366                 cond_resched();
1367                 goto again;
1368         }
1369         free_extent_state(cached_state);
1370         *start = delalloc_start;
1371         *end = delalloc_end;
1372 out_failed:
1373         return found;
1374 }
1375
1376 int extent_clear_unlock_delalloc(struct inode *inode,
1377                                 struct extent_io_tree *tree,
1378                                 u64 start, u64 end, struct page *locked_page,
1379                                 unsigned long op)
1380 {
1381         int ret;
1382         struct page *pages[16];
1383         unsigned long index = start >> PAGE_CACHE_SHIFT;
1384         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1385         unsigned long nr_pages = end_index - index + 1;
1386         int i;
1387         int clear_bits = 0;
1388
1389         if (op & EXTENT_CLEAR_UNLOCK)
1390                 clear_bits |= EXTENT_LOCKED;
1391         if (op & EXTENT_CLEAR_DIRTY)
1392                 clear_bits |= EXTENT_DIRTY;
1393
1394         if (op & EXTENT_CLEAR_DELALLOC)
1395                 clear_bits |= EXTENT_DELALLOC;
1396
1397         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1398         if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1399                     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1400                     EXTENT_SET_PRIVATE2)))
1401                 return 0;
1402
1403         while (nr_pages > 0) {
1404                 ret = find_get_pages_contig(inode->i_mapping, index,
1405                                      min_t(unsigned long,
1406                                      nr_pages, ARRAY_SIZE(pages)), pages);
1407                 for (i = 0; i < ret; i++) {
1408
1409                         if (op & EXTENT_SET_PRIVATE2)
1410                                 SetPagePrivate2(pages[i]);
1411
1412                         if (pages[i] == locked_page) {
1413                                 page_cache_release(pages[i]);
1414                                 continue;
1415                         }
1416                         if (op & EXTENT_CLEAR_DIRTY)
1417                                 clear_page_dirty_for_io(pages[i]);
1418                         if (op & EXTENT_SET_WRITEBACK)
1419                                 set_page_writeback(pages[i]);
1420                         if (op & EXTENT_END_WRITEBACK)
1421                                 end_page_writeback(pages[i]);
1422                         if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1423                                 unlock_page(pages[i]);
1424                         page_cache_release(pages[i]);
1425                 }
1426                 nr_pages -= ret;
1427                 index += ret;
1428                 cond_resched();
1429         }
1430         return 0;
1431 }
1432
1433 /*
1434  * count the number of bytes in the tree that have a given bit(s)
1435  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1436  * cached.  The total number found is returned.
1437  */
1438 u64 count_range_bits(struct extent_io_tree *tree,
1439                      u64 *start, u64 search_end, u64 max_bytes,
1440                      unsigned long bits, int contig)
1441 {
1442         struct rb_node *node;
1443         struct extent_state *state;
1444         u64 cur_start = *start;
1445         u64 total_bytes = 0;
1446         u64 last = 0;
1447         int found = 0;
1448
1449         if (search_end <= cur_start) {
1450                 WARN_ON(1);
1451                 return 0;
1452         }
1453
1454         spin_lock(&tree->lock);
1455         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1456                 total_bytes = tree->dirty_bytes;
1457                 goto out;
1458         }
1459         /*
1460          * this search will find all the extents that end after
1461          * our range starts.
1462          */
1463         node = tree_search(tree, cur_start);
1464         if (!node)
1465                 goto out;
1466
1467         while (1) {
1468                 state = rb_entry(node, struct extent_state, rb_node);
1469                 if (state->start > search_end)
1470                         break;
1471                 if (contig && found && state->start > last + 1)
1472                         break;
1473                 if (state->end >= cur_start && (state->state & bits) == bits) {
1474                         total_bytes += min(search_end, state->end) + 1 -
1475                                        max(cur_start, state->start);
1476                         if (total_bytes >= max_bytes)
1477                                 break;
1478                         if (!found) {
1479                                 *start = max(cur_start, state->start);
1480                                 found = 1;
1481                         }
1482                         last = state->end;
1483                 } else if (contig && found) {
1484                         break;
1485                 }
1486                 node = rb_next(node);
1487                 if (!node)
1488                         break;
1489         }
1490 out:
1491         spin_unlock(&tree->lock);
1492         return total_bytes;
1493 }
1494
1495 /*
1496  * set the private field for a given byte offset in the tree.  If there isn't
1497  * an extent_state there already, this does nothing.
1498  */
1499 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1500 {
1501         struct rb_node *node;
1502         struct extent_state *state;
1503         int ret = 0;
1504
1505         spin_lock(&tree->lock);
1506         /*
1507          * this search will find all the extents that end after
1508          * our range starts.
1509          */
1510         node = tree_search(tree, start);
1511         if (!node) {
1512                 ret = -ENOENT;
1513                 goto out;
1514         }
1515         state = rb_entry(node, struct extent_state, rb_node);
1516         if (state->start != start) {
1517                 ret = -ENOENT;
1518                 goto out;
1519         }
1520         state->private = private;
1521 out:
1522         spin_unlock(&tree->lock);
1523         return ret;
1524 }
1525
1526 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1527 {
1528         struct rb_node *node;
1529         struct extent_state *state;
1530         int ret = 0;
1531
1532         spin_lock(&tree->lock);
1533         /*
1534          * this search will find all the extents that end after
1535          * our range starts.
1536          */
1537         node = tree_search(tree, start);
1538         if (!node) {
1539                 ret = -ENOENT;
1540                 goto out;
1541         }
1542         state = rb_entry(node, struct extent_state, rb_node);
1543         if (state->start != start) {
1544                 ret = -ENOENT;
1545                 goto out;
1546         }
1547         *private = state->private;
1548 out:
1549         spin_unlock(&tree->lock);
1550         return ret;
1551 }
1552
1553 /*
1554  * searches a range in the state tree for a given mask.
1555  * If 'filled' == 1, this returns 1 only if every extent in the tree
1556  * has the bits set.  Otherwise, 1 is returned if any bit in the
1557  * range is found set.
1558  */
1559 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1560                    int bits, int filled, struct extent_state *cached)
1561 {
1562         struct extent_state *state = NULL;
1563         struct rb_node *node;
1564         int bitset = 0;
1565
1566         spin_lock(&tree->lock);
1567         if (cached && cached->tree && cached->start == start)
1568                 node = &cached->rb_node;
1569         else
1570                 node = tree_search(tree, start);
1571         while (node && start <= end) {
1572                 state = rb_entry(node, struct extent_state, rb_node);
1573
1574                 if (filled && state->start > start) {
1575                         bitset = 0;
1576                         break;
1577                 }
1578
1579                 if (state->start > end)
1580                         break;
1581
1582                 if (state->state & bits) {
1583                         bitset = 1;
1584                         if (!filled)
1585                                 break;
1586                 } else if (filled) {
1587                         bitset = 0;
1588                         break;
1589                 }
1590
1591                 if (state->end == (u64)-1)
1592                         break;
1593
1594                 start = state->end + 1;
1595                 if (start > end)
1596                         break;
1597                 node = rb_next(node);
1598                 if (!node) {
1599                         if (filled)
1600                                 bitset = 0;
1601                         break;
1602                 }
1603         }
1604         spin_unlock(&tree->lock);
1605         return bitset;
1606 }
1607
1608 /*
1609  * helper function to set a given page up to date if all the
1610  * extents in the tree for that page are up to date
1611  */
1612 static int check_page_uptodate(struct extent_io_tree *tree,
1613                                struct page *page)
1614 {
1615         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1616         u64 end = start + PAGE_CACHE_SIZE - 1;
1617         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1618                 SetPageUptodate(page);
1619         return 0;
1620 }
1621
1622 /*
1623  * helper function to unlock a page if all the extents in the tree
1624  * for that page are unlocked
1625  */
1626 static int check_page_locked(struct extent_io_tree *tree,
1627                              struct page *page)
1628 {
1629         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1630         u64 end = start + PAGE_CACHE_SIZE - 1;
1631         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1632                 unlock_page(page);
1633         return 0;
1634 }
1635
1636 /*
1637  * helper function to end page writeback if all the extents
1638  * in the tree for that page are done with writeback
1639  */
1640 static int check_page_writeback(struct extent_io_tree *tree,
1641                              struct page *page)
1642 {
1643         end_page_writeback(page);
1644         return 0;
1645 }
1646
1647 /* lots and lots of room for performance fixes in the end_bio funcs */
1648
1649 /*
1650  * after a writepage IO is done, we need to:
1651  * clear the uptodate bits on error
1652  * clear the writeback bits in the extent tree for this IO
1653  * end_page_writeback if the page has no more pending IO
1654  *
1655  * Scheduling is not allowed, so the extent state tree is expected
1656  * to have one and only one object corresponding to this IO.
1657  */
1658 static void end_bio_extent_writepage(struct bio *bio, int err)
1659 {
1660         int uptodate = err == 0;
1661         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1662         struct extent_io_tree *tree;
1663         u64 start;
1664         u64 end;
1665         int whole_page;
1666         int ret;
1667
1668         do {
1669                 struct page *page = bvec->bv_page;
1670                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1671
1672                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1673                          bvec->bv_offset;
1674                 end = start + bvec->bv_len - 1;
1675
1676                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1677                         whole_page = 1;
1678                 else
1679                         whole_page = 0;
1680
1681                 if (--bvec >= bio->bi_io_vec)
1682                         prefetchw(&bvec->bv_page->flags);
1683                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1684                         ret = tree->ops->writepage_end_io_hook(page, start,
1685                                                        end, NULL, uptodate);
1686                         if (ret)
1687                                 uptodate = 0;
1688                 }
1689
1690                 if (!uptodate && tree->ops &&
1691                     tree->ops->writepage_io_failed_hook) {
1692                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1693                                                          start, end, NULL);
1694                         if (ret == 0) {
1695                                 uptodate = (err == 0);
1696                                 continue;
1697                         }
1698                 }
1699
1700                 if (!uptodate) {
1701                         clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1702                         ClearPageUptodate(page);
1703                         SetPageError(page);
1704                 }
1705
1706                 if (whole_page)
1707                         end_page_writeback(page);
1708                 else
1709                         check_page_writeback(tree, page);
1710         } while (bvec >= bio->bi_io_vec);
1711
1712         bio_put(bio);
1713 }
1714
1715 /*
1716  * after a readpage IO is done, we need to:
1717  * clear the uptodate bits on error
1718  * set the uptodate bits if things worked
1719  * set the page up to date if all extents in the tree are uptodate
1720  * clear the lock bit in the extent tree
1721  * unlock the page if there are no other extents locked for it
1722  *
1723  * Scheduling is not allowed, so the extent state tree is expected
1724  * to have one and only one object corresponding to this IO.
1725  */
1726 static void end_bio_extent_readpage(struct bio *bio, int err)
1727 {
1728         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1729         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1730         struct bio_vec *bvec = bio->bi_io_vec;
1731         struct extent_io_tree *tree;
1732         u64 start;
1733         u64 end;
1734         int whole_page;
1735         int ret;
1736
1737         if (err)
1738                 uptodate = 0;
1739
1740         do {
1741                 struct page *page = bvec->bv_page;
1742                 struct extent_state *cached = NULL;
1743                 struct extent_state *state;
1744
1745                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1746
1747                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1748                         bvec->bv_offset;
1749                 end = start + bvec->bv_len - 1;
1750
1751                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1752                         whole_page = 1;
1753                 else
1754                         whole_page = 0;
1755
1756                 if (++bvec <= bvec_end)
1757                         prefetchw(&bvec->bv_page->flags);
1758
1759                 spin_lock(&tree->lock);
1760                 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1761                 if (state && state->start == start) {
1762                         /*
1763                          * take a reference on the state, unlock will drop
1764                          * the ref
1765                          */
1766                         cache_state(state, &cached);
1767                 }
1768                 spin_unlock(&tree->lock);
1769
1770                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1771                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1772                                                               state);
1773                         if (ret)
1774                                 uptodate = 0;
1775                 }
1776                 if (!uptodate && tree->ops &&
1777                     tree->ops->readpage_io_failed_hook) {
1778                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1779                                                          start, end, NULL);
1780                         if (ret == 0) {
1781                                 uptodate =
1782                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1783                                 if (err)
1784                                         uptodate = 0;
1785                                 uncache_state(&cached);
1786                                 continue;
1787                         }
1788                 }
1789
1790                 if (uptodate) {
1791                         set_extent_uptodate(tree, start, end, &cached,
1792                                             GFP_ATOMIC);
1793                 }
1794                 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1795
1796                 if (whole_page) {
1797                         if (uptodate) {
1798                                 SetPageUptodate(page);
1799                         } else {
1800                                 ClearPageUptodate(page);
1801                                 SetPageError(page);
1802                         }
1803                         unlock_page(page);
1804                 } else {
1805                         if (uptodate) {
1806                                 check_page_uptodate(tree, page);
1807                         } else {
1808                                 ClearPageUptodate(page);
1809                                 SetPageError(page);
1810                         }
1811                         check_page_locked(tree, page);
1812                 }
1813         } while (bvec <= bvec_end);
1814
1815         bio_put(bio);
1816 }
1817
1818 struct bio *
1819 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1820                 gfp_t gfp_flags)
1821 {
1822         struct bio *bio;
1823
1824         bio = bio_alloc(gfp_flags, nr_vecs);
1825
1826         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1827                 while (!bio && (nr_vecs /= 2))
1828                         bio = bio_alloc(gfp_flags, nr_vecs);
1829         }
1830
1831         if (bio) {
1832                 bio->bi_size = 0;
1833                 bio->bi_bdev = bdev;
1834                 bio->bi_sector = first_sector;
1835         }
1836         return bio;
1837 }
1838
1839 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1840                           unsigned long bio_flags)
1841 {
1842         int ret = 0;
1843         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1844         struct page *page = bvec->bv_page;
1845         struct extent_io_tree *tree = bio->bi_private;
1846         u64 start;
1847
1848         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1849
1850         bio->bi_private = NULL;
1851
1852         bio_get(bio);
1853
1854         if (tree->ops && tree->ops->submit_bio_hook)
1855                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856                                            mirror_num, bio_flags, start);
1857         else
1858                 submit_bio(rw, bio);
1859         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860                 ret = -EOPNOTSUPP;
1861         bio_put(bio);
1862         return ret;
1863 }
1864
1865 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866                               struct page *page, sector_t sector,
1867                               size_t size, unsigned long offset,
1868                               struct block_device *bdev,
1869                               struct bio **bio_ret,
1870                               unsigned long max_pages,
1871                               bio_end_io_t end_io_func,
1872                               int mirror_num,
1873                               unsigned long prev_bio_flags,
1874                               unsigned long bio_flags)
1875 {
1876         int ret = 0;
1877         struct bio *bio;
1878         int nr;
1879         int contig = 0;
1880         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884         if (bio_ret && *bio_ret) {
1885                 bio = *bio_ret;
1886                 if (old_compressed)
1887                         contig = bio->bi_sector == sector;
1888                 else
1889                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890                                 sector;
1891
1892                 if (prev_bio_flags != bio_flags || !contig ||
1893                     (tree->ops && tree->ops->merge_bio_hook &&
1894                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895                                                bio_flags)) ||
1896                     bio_add_page(bio, page, page_size, offset) < page_size) {
1897                         ret = submit_one_bio(rw, bio, mirror_num,
1898                                              prev_bio_flags);
1899                         bio = NULL;
1900                 } else {
1901                         return 0;
1902                 }
1903         }
1904         if (this_compressed)
1905                 nr = BIO_MAX_PAGES;
1906         else
1907                 nr = bio_get_nr_vecs(bdev);
1908
1909         bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910         if (!bio)
1911                 return -ENOMEM;
1912
1913         bio_add_page(bio, page, page_size, offset);
1914         bio->bi_end_io = end_io_func;
1915         bio->bi_private = tree;
1916
1917         if (bio_ret)
1918                 *bio_ret = bio;
1919         else
1920                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1921
1922         return ret;
1923 }
1924
1925 void set_page_extent_mapped(struct page *page)
1926 {
1927         if (!PagePrivate(page)) {
1928                 SetPagePrivate(page);
1929                 page_cache_get(page);
1930                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1931         }
1932 }
1933
1934 static void set_page_extent_head(struct page *page, unsigned long len)
1935 {
1936         WARN_ON(!PagePrivate(page));
1937         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1938 }
1939
1940 /*
1941  * basic readpage implementation.  Locked extent state structs are inserted
1942  * into the tree that are removed when the IO is done (by the end_io
1943  * handlers)
1944  */
1945 static int __extent_read_full_page(struct extent_io_tree *tree,
1946                                    struct page *page,
1947                                    get_extent_t *get_extent,
1948                                    struct bio **bio, int mirror_num,
1949                                    unsigned long *bio_flags)
1950 {
1951         struct inode *inode = page->mapping->host;
1952         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1953         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1954         u64 end;
1955         u64 cur = start;
1956         u64 extent_offset;
1957         u64 last_byte = i_size_read(inode);
1958         u64 block_start;
1959         u64 cur_end;
1960         sector_t sector;
1961         struct extent_map *em;
1962         struct block_device *bdev;
1963         struct btrfs_ordered_extent *ordered;
1964         int ret;
1965         int nr = 0;
1966         size_t pg_offset = 0;
1967         size_t iosize;
1968         size_t disk_io_size;
1969         size_t blocksize = inode->i_sb->s_blocksize;
1970         unsigned long this_bio_flag = 0;
1971
1972         set_page_extent_mapped(page);
1973
1974         if (!PageUptodate(page)) {
1975                 if (cleancache_get_page(page) == 0) {
1976                         BUG_ON(blocksize != PAGE_SIZE);
1977                         goto out;
1978                 }
1979         }
1980
1981         end = page_end;
1982         while (1) {
1983                 lock_extent(tree, start, end, GFP_NOFS);
1984                 ordered = btrfs_lookup_ordered_extent(inode, start);
1985                 if (!ordered)
1986                         break;
1987                 unlock_extent(tree, start, end, GFP_NOFS);
1988                 btrfs_start_ordered_extent(inode, ordered, 1);
1989                 btrfs_put_ordered_extent(ordered);
1990         }
1991
1992         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1993                 char *userpage;
1994                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1995
1996                 if (zero_offset) {
1997                         iosize = PAGE_CACHE_SIZE - zero_offset;
1998                         userpage = kmap_atomic(page, KM_USER0);
1999                         memset(userpage + zero_offset, 0, iosize);
2000                         flush_dcache_page(page);
2001                         kunmap_atomic(userpage, KM_USER0);
2002                 }
2003         }
2004         while (cur <= end) {
2005                 if (cur >= last_byte) {
2006                         char *userpage;
2007                         struct extent_state *cached = NULL;
2008
2009                         iosize = PAGE_CACHE_SIZE - pg_offset;
2010                         userpage = kmap_atomic(page, KM_USER0);
2011                         memset(userpage + pg_offset, 0, iosize);
2012                         flush_dcache_page(page);
2013                         kunmap_atomic(userpage, KM_USER0);
2014                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2015                                             &cached, GFP_NOFS);
2016                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2017                                              &cached, GFP_NOFS);
2018                         break;
2019                 }
2020                 em = get_extent(inode, page, pg_offset, cur,
2021                                 end - cur + 1, 0);
2022                 if (IS_ERR_OR_NULL(em)) {
2023                         SetPageError(page);
2024                         unlock_extent(tree, cur, end, GFP_NOFS);
2025                         break;
2026                 }
2027                 extent_offset = cur - em->start;
2028                 BUG_ON(extent_map_end(em) <= cur);
2029                 BUG_ON(end < cur);
2030
2031                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2033                         extent_set_compress_type(&this_bio_flag,
2034                                                  em->compress_type);
2035                 }
2036
2037                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2038                 cur_end = min(extent_map_end(em) - 1, end);
2039                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2040                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2041                         disk_io_size = em->block_len;
2042                         sector = em->block_start >> 9;
2043                 } else {
2044                         sector = (em->block_start + extent_offset) >> 9;
2045                         disk_io_size = iosize;
2046                 }
2047                 bdev = em->bdev;
2048                 block_start = em->block_start;
2049                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2050                         block_start = EXTENT_MAP_HOLE;
2051                 free_extent_map(em);
2052                 em = NULL;
2053
2054                 /* we've found a hole, just zero and go on */
2055                 if (block_start == EXTENT_MAP_HOLE) {
2056                         char *userpage;
2057                         struct extent_state *cached = NULL;
2058
2059                         userpage = kmap_atomic(page, KM_USER0);
2060                         memset(userpage + pg_offset, 0, iosize);
2061                         flush_dcache_page(page);
2062                         kunmap_atomic(userpage, KM_USER0);
2063
2064                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2065                                             &cached, GFP_NOFS);
2066                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2067                                              &cached, GFP_NOFS);
2068                         cur = cur + iosize;
2069                         pg_offset += iosize;
2070                         continue;
2071                 }
2072                 /* the get_extent function already copied into the page */
2073                 if (test_range_bit(tree, cur, cur_end,
2074                                    EXTENT_UPTODATE, 1, NULL)) {
2075                         check_page_uptodate(tree, page);
2076                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2077                         cur = cur + iosize;
2078                         pg_offset += iosize;
2079                         continue;
2080                 }
2081                 /* we have an inline extent but it didn't get marked up
2082                  * to date.  Error out
2083                  */
2084                 if (block_start == EXTENT_MAP_INLINE) {
2085                         SetPageError(page);
2086                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2087                         cur = cur + iosize;
2088                         pg_offset += iosize;
2089                         continue;
2090                 }
2091
2092                 ret = 0;
2093                 if (tree->ops && tree->ops->readpage_io_hook) {
2094                         ret = tree->ops->readpage_io_hook(page, cur,
2095                                                           cur + iosize - 1);
2096                 }
2097                 if (!ret) {
2098                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2099                         pnr -= page->index;
2100                         ret = submit_extent_page(READ, tree, page,
2101                                          sector, disk_io_size, pg_offset,
2102                                          bdev, bio, pnr,
2103                                          end_bio_extent_readpage, mirror_num,
2104                                          *bio_flags,
2105                                          this_bio_flag);
2106                         nr++;
2107                         *bio_flags = this_bio_flag;
2108                 }
2109                 if (ret)
2110                         SetPageError(page);
2111                 cur = cur + iosize;
2112                 pg_offset += iosize;
2113         }
2114 out:
2115         if (!nr) {
2116                 if (!PageError(page))
2117                         SetPageUptodate(page);
2118                 unlock_page(page);
2119         }
2120         return 0;
2121 }
2122
2123 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2124                             get_extent_t *get_extent)
2125 {
2126         struct bio *bio = NULL;
2127         unsigned long bio_flags = 0;
2128         int ret;
2129
2130         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2131                                       &bio_flags);
2132         if (bio)
2133                 ret = submit_one_bio(READ, bio, 0, bio_flags);
2134         return ret;
2135 }
2136
2137 static noinline void update_nr_written(struct page *page,
2138                                       struct writeback_control *wbc,
2139                                       unsigned long nr_written)
2140 {
2141         wbc->nr_to_write -= nr_written;
2142         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2143             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2144                 page->mapping->writeback_index = page->index + nr_written;
2145 }
2146
2147 /*
2148  * the writepage semantics are similar to regular writepage.  extent
2149  * records are inserted to lock ranges in the tree, and as dirty areas
2150  * are found, they are marked writeback.  Then the lock bits are removed
2151  * and the end_io handler clears the writeback ranges
2152  */
2153 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2154                               void *data)
2155 {
2156         struct inode *inode = page->mapping->host;
2157         struct extent_page_data *epd = data;
2158         struct extent_io_tree *tree = epd->tree;
2159         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2160         u64 delalloc_start;
2161         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2162         u64 end;
2163         u64 cur = start;
2164         u64 extent_offset;
2165         u64 last_byte = i_size_read(inode);
2166         u64 block_start;
2167         u64 iosize;
2168         sector_t sector;
2169         struct extent_state *cached_state = NULL;
2170         struct extent_map *em;
2171         struct block_device *bdev;
2172         int ret;
2173         int nr = 0;
2174         size_t pg_offset = 0;
2175         size_t blocksize;
2176         loff_t i_size = i_size_read(inode);
2177         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2178         u64 nr_delalloc;
2179         u64 delalloc_end;
2180         int page_started;
2181         int compressed;
2182         int write_flags;
2183         unsigned long nr_written = 0;
2184
2185         if (wbc->sync_mode == WB_SYNC_ALL)
2186                 write_flags = WRITE_SYNC;
2187         else
2188                 write_flags = WRITE;
2189
2190         trace___extent_writepage(page, inode, wbc);
2191
2192         WARN_ON(!PageLocked(page));
2193         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2194         if (page->index > end_index ||
2195            (page->index == end_index && !pg_offset)) {
2196                 page->mapping->a_ops->invalidatepage(page, 0);
2197                 unlock_page(page);
2198                 return 0;
2199         }
2200
2201         if (page->index == end_index) {
2202                 char *userpage;
2203
2204                 userpage = kmap_atomic(page, KM_USER0);
2205                 memset(userpage + pg_offset, 0,
2206                        PAGE_CACHE_SIZE - pg_offset);
2207                 kunmap_atomic(userpage, KM_USER0);
2208                 flush_dcache_page(page);
2209         }
2210         pg_offset = 0;
2211
2212         set_page_extent_mapped(page);
2213
2214         delalloc_start = start;
2215         delalloc_end = 0;
2216         page_started = 0;
2217         if (!epd->extent_locked) {
2218                 u64 delalloc_to_write = 0;
2219                 /*
2220                  * make sure the wbc mapping index is at least updated
2221                  * to this page.
2222                  */
2223                 update_nr_written(page, wbc, 0);
2224
2225                 while (delalloc_end < page_end) {
2226                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2227                                                        page,
2228                                                        &delalloc_start,
2229                                                        &delalloc_end,
2230                                                        128 * 1024 * 1024);
2231                         if (nr_delalloc == 0) {
2232                                 delalloc_start = delalloc_end + 1;
2233                                 continue;
2234                         }
2235                         tree->ops->fill_delalloc(inode, page, delalloc_start,
2236                                                  delalloc_end, &page_started,
2237                                                  &nr_written);
2238                         /*
2239                          * delalloc_end is already one less than the total
2240                          * length, so we don't subtract one from
2241                          * PAGE_CACHE_SIZE
2242                          */
2243                         delalloc_to_write += (delalloc_end - delalloc_start +
2244                                               PAGE_CACHE_SIZE) >>
2245                                               PAGE_CACHE_SHIFT;
2246                         delalloc_start = delalloc_end + 1;
2247                 }
2248                 if (wbc->nr_to_write < delalloc_to_write) {
2249                         int thresh = 8192;
2250
2251                         if (delalloc_to_write < thresh * 2)
2252                                 thresh = delalloc_to_write;
2253                         wbc->nr_to_write = min_t(u64, delalloc_to_write,
2254                                                  thresh);
2255                 }
2256
2257                 /* did the fill delalloc function already unlock and start
2258                  * the IO?
2259                  */
2260                 if (page_started) {
2261                         ret = 0;
2262                         /*
2263                          * we've unlocked the page, so we can't update
2264                          * the mapping's writeback index, just update
2265                          * nr_to_write.
2266                          */
2267                         wbc->nr_to_write -= nr_written;
2268                         goto done_unlocked;
2269                 }
2270         }
2271         if (tree->ops && tree->ops->writepage_start_hook) {
2272                 ret = tree->ops->writepage_start_hook(page, start,
2273                                                       page_end);
2274                 if (ret == -EAGAIN) {
2275                         redirty_page_for_writepage(wbc, page);
2276                         update_nr_written(page, wbc, nr_written);
2277                         unlock_page(page);
2278                         ret = 0;
2279                         goto done_unlocked;
2280                 }
2281         }
2282
2283         /*
2284          * we don't want to touch the inode after unlocking the page,
2285          * so we update the mapping writeback index now
2286          */
2287         update_nr_written(page, wbc, nr_written + 1);
2288
2289         end = page_end;
2290         if (last_byte <= start) {
2291                 if (tree->ops && tree->ops->writepage_end_io_hook)
2292                         tree->ops->writepage_end_io_hook(page, start,
2293                                                          page_end, NULL, 1);
2294                 goto done;
2295         }
2296
2297         blocksize = inode->i_sb->s_blocksize;
2298
2299         while (cur <= end) {
2300                 if (cur >= last_byte) {
2301                         if (tree->ops && tree->ops->writepage_end_io_hook)
2302                                 tree->ops->writepage_end_io_hook(page, cur,
2303                                                          page_end, NULL, 1);
2304                         break;
2305                 }
2306                 em = epd->get_extent(inode, page, pg_offset, cur,
2307                                      end - cur + 1, 1);
2308                 if (IS_ERR_OR_NULL(em)) {
2309                         SetPageError(page);
2310                         break;
2311                 }
2312
2313                 extent_offset = cur - em->start;
2314                 BUG_ON(extent_map_end(em) <= cur);
2315                 BUG_ON(end < cur);
2316                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2317                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2318                 sector = (em->block_start + extent_offset) >> 9;
2319                 bdev = em->bdev;
2320                 block_start = em->block_start;
2321                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2322                 free_extent_map(em);
2323                 em = NULL;
2324
2325                 /*
2326                  * compressed and inline extents are written through other
2327                  * paths in the FS
2328                  */
2329                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2330                     block_start == EXTENT_MAP_INLINE) {
2331                         /*
2332                          * end_io notification does not happen here for
2333                          * compressed extents
2334                          */
2335                         if (!compressed && tree->ops &&
2336                             tree->ops->writepage_end_io_hook)
2337                                 tree->ops->writepage_end_io_hook(page, cur,
2338                                                          cur + iosize - 1,
2339                                                          NULL, 1);
2340                         else if (compressed) {
2341                                 /* we don't want to end_page_writeback on
2342                                  * a compressed extent.  this happens
2343                                  * elsewhere
2344                                  */
2345                                 nr++;
2346                         }
2347
2348                         cur += iosize;
2349                         pg_offset += iosize;
2350                         continue;
2351                 }
2352                 /* leave this out until we have a page_mkwrite call */
2353                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2354                                    EXTENT_DIRTY, 0, NULL)) {
2355                         cur = cur + iosize;
2356                         pg_offset += iosize;
2357                         continue;
2358                 }
2359
2360                 if (tree->ops && tree->ops->writepage_io_hook) {
2361                         ret = tree->ops->writepage_io_hook(page, cur,
2362                                                 cur + iosize - 1);
2363                 } else {
2364                         ret = 0;
2365                 }
2366                 if (ret) {
2367                         SetPageError(page);
2368                 } else {
2369                         unsigned long max_nr = end_index + 1;
2370
2371                         set_range_writeback(tree, cur, cur + iosize - 1);
2372                         if (!PageWriteback(page)) {
2373                                 printk(KERN_ERR "btrfs warning page %lu not "
2374                                        "writeback, cur %llu end %llu\n",
2375                                        page->index, (unsigned long long)cur,
2376                                        (unsigned long long)end);
2377                         }
2378
2379                         ret = submit_extent_page(write_flags, tree, page,
2380                                                  sector, iosize, pg_offset,
2381                                                  bdev, &epd->bio, max_nr,
2382                                                  end_bio_extent_writepage,
2383                                                  0, 0, 0);
2384                         if (ret)
2385                                 SetPageError(page);
2386                 }
2387                 cur = cur + iosize;
2388                 pg_offset += iosize;
2389                 nr++;
2390         }
2391 done:
2392         if (nr == 0) {
2393                 /* make sure the mapping tag for page dirty gets cleared */
2394                 set_page_writeback(page);
2395                 end_page_writeback(page);
2396         }
2397         unlock_page(page);
2398
2399 done_unlocked:
2400
2401         /* drop our reference on any cached states */
2402         free_extent_state(cached_state);
2403         return 0;
2404 }
2405
2406 /**
2407  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2408  * @mapping: address space structure to write
2409  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2410  * @writepage: function called for each page
2411  * @data: data passed to writepage function
2412  *
2413  * If a page is already under I/O, write_cache_pages() skips it, even
2414  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2415  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2416  * and msync() need to guarantee that all the data which was dirty at the time
2417  * the call was made get new I/O started against them.  If wbc->sync_mode is
2418  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2419  * existing IO to complete.
2420  */
2421 static int extent_write_cache_pages(struct extent_io_tree *tree,
2422                              struct address_space *mapping,
2423                              struct writeback_control *wbc,
2424                              writepage_t writepage, void *data,
2425                              void (*flush_fn)(void *))
2426 {
2427         int ret = 0;
2428         int done = 0;
2429         int nr_to_write_done = 0;
2430         struct pagevec pvec;
2431         int nr_pages;
2432         pgoff_t index;
2433         pgoff_t end;            /* Inclusive */
2434         int scanned = 0;
2435
2436         pagevec_init(&pvec, 0);
2437         if (wbc->range_cyclic) {
2438                 index = mapping->writeback_index; /* Start from prev offset */
2439                 end = -1;
2440         } else {
2441                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2442                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443                 scanned = 1;
2444         }
2445 retry:
2446         while (!done && !nr_to_write_done && (index <= end) &&
2447                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2448                               PAGECACHE_TAG_DIRTY, min(end - index,
2449                                   (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450                 unsigned i;
2451
2452                 scanned = 1;
2453                 for (i = 0; i < nr_pages; i++) {
2454                         struct page *page = pvec.pages[i];
2455
2456                         /*
2457                          * At this point we hold neither mapping->tree_lock nor
2458                          * lock on the page itself: the page may be truncated or
2459                          * invalidated (changing page->mapping to NULL), or even
2460                          * swizzled back from swapper_space to tmpfs file
2461                          * mapping
2462                          */
2463                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2464                                 tree->ops->write_cache_pages_lock_hook(page);
2465                         else
2466                                 lock_page(page);
2467
2468                         if (unlikely(page->mapping != mapping)) {
2469                                 unlock_page(page);
2470                                 continue;
2471                         }
2472
2473                         if (!wbc->range_cyclic && page->index > end) {
2474                                 done = 1;
2475                                 unlock_page(page);
2476                                 continue;
2477                         }
2478
2479                         if (wbc->sync_mode != WB_SYNC_NONE) {
2480                                 if (PageWriteback(page))
2481                                         flush_fn(data);
2482                                 wait_on_page_writeback(page);
2483                         }
2484
2485                         if (PageWriteback(page) ||
2486                             !clear_page_dirty_for_io(page)) {
2487                                 unlock_page(page);
2488                                 continue;
2489                         }
2490
2491                         ret = (*writepage)(page, wbc, data);
2492
2493                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2494                                 unlock_page(page);
2495                                 ret = 0;
2496                         }
2497                         if (ret)
2498                                 done = 1;
2499
2500                         /*
2501                          * the filesystem may choose to bump up nr_to_write.
2502                          * We have to make sure to honor the new nr_to_write
2503                          * at any time
2504                          */
2505                         nr_to_write_done = wbc->nr_to_write <= 0;
2506                 }
2507                 pagevec_release(&pvec);
2508                 cond_resched();
2509         }
2510         if (!scanned && !done) {
2511                 /*
2512                  * We hit the last page and there is more work to be done: wrap
2513                  * back to the start of the file
2514                  */
2515                 scanned = 1;
2516                 index = 0;
2517                 goto retry;
2518         }
2519         return ret;
2520 }
2521
2522 static void flush_epd_write_bio(struct extent_page_data *epd)
2523 {
2524         if (epd->bio) {
2525                 if (epd->sync_io)
2526                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2527                 else
2528                         submit_one_bio(WRITE, epd->bio, 0, 0);
2529                 epd->bio = NULL;
2530         }
2531 }
2532
2533 static noinline void flush_write_bio(void *data)
2534 {
2535         struct extent_page_data *epd = data;
2536         flush_epd_write_bio(epd);
2537 }
2538
2539 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2540                           get_extent_t *get_extent,
2541                           struct writeback_control *wbc)
2542 {
2543         int ret;
2544         struct address_space *mapping = page->mapping;
2545         struct extent_page_data epd = {
2546                 .bio = NULL,
2547                 .tree = tree,
2548                 .get_extent = get_extent,
2549                 .extent_locked = 0,
2550                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2551         };
2552         struct writeback_control wbc_writepages = {
2553                 .sync_mode      = wbc->sync_mode,
2554                 .older_than_this = NULL,
2555                 .nr_to_write    = 64,
2556                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2557                 .range_end      = (loff_t)-1,
2558         };
2559
2560         ret = __extent_writepage(page, wbc, &epd);
2561
2562         extent_write_cache_pages(tree, mapping, &wbc_writepages,
2563                                  __extent_writepage, &epd, flush_write_bio);
2564         flush_epd_write_bio(&epd);
2565         return ret;
2566 }
2567
2568 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2569                               u64 start, u64 end, get_extent_t *get_extent,
2570                               int mode)
2571 {
2572         int ret = 0;
2573         struct address_space *mapping = inode->i_mapping;
2574         struct page *page;
2575         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2576                 PAGE_CACHE_SHIFT;
2577
2578         struct extent_page_data epd = {
2579                 .bio = NULL,
2580                 .tree = tree,
2581                 .get_extent = get_extent,
2582                 .extent_locked = 1,
2583                 .sync_io = mode == WB_SYNC_ALL,
2584         };
2585         struct writeback_control wbc_writepages = {
2586                 .sync_mode      = mode,
2587                 .older_than_this = NULL,
2588                 .nr_to_write    = nr_pages * 2,
2589                 .range_start    = start,
2590                 .range_end      = end + 1,
2591         };
2592
2593         while (start <= end) {
2594                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2595                 if (clear_page_dirty_for_io(page))
2596                         ret = __extent_writepage(page, &wbc_writepages, &epd);
2597                 else {
2598                         if (tree->ops && tree->ops->writepage_end_io_hook)
2599                                 tree->ops->writepage_end_io_hook(page, start,
2600                                                  start + PAGE_CACHE_SIZE - 1,
2601                                                  NULL, 1);
2602                         unlock_page(page);
2603                 }
2604                 page_cache_release(page);
2605                 start += PAGE_CACHE_SIZE;
2606         }
2607
2608         flush_epd_write_bio(&epd);
2609         return ret;
2610 }
2611
2612 int extent_writepages(struct extent_io_tree *tree,
2613                       struct address_space *mapping,
2614                       get_extent_t *get_extent,
2615                       struct writeback_control *wbc)
2616 {
2617         int ret = 0;
2618         struct extent_page_data epd = {
2619                 .bio = NULL,
2620                 .tree = tree,
2621                 .get_extent = get_extent,
2622                 .extent_locked = 0,
2623                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2624         };
2625
2626         ret = extent_write_cache_pages(tree, mapping, wbc,
2627                                        __extent_writepage, &epd,
2628                                        flush_write_bio);
2629         flush_epd_write_bio(&epd);
2630         return ret;
2631 }
2632
2633 int extent_readpages(struct extent_io_tree *tree,
2634                      struct address_space *mapping,
2635                      struct list_head *pages, unsigned nr_pages,
2636                      get_extent_t get_extent)
2637 {
2638         struct bio *bio = NULL;
2639         unsigned page_idx;
2640         unsigned long bio_flags = 0;
2641
2642         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2643                 struct page *page = list_entry(pages->prev, struct page, lru);
2644
2645                 prefetchw(&page->flags);
2646                 list_del(&page->lru);
2647                 if (!add_to_page_cache_lru(page, mapping,
2648                                         page->index, GFP_NOFS)) {
2649                         __extent_read_full_page(tree, page, get_extent,
2650                                                 &bio, 0, &bio_flags);
2651                 }
2652                 page_cache_release(page);
2653         }
2654         BUG_ON(!list_empty(pages));
2655         if (bio)
2656                 submit_one_bio(READ, bio, 0, bio_flags);
2657         return 0;
2658 }
2659
2660 /*
2661  * basic invalidatepage code, this waits on any locked or writeback
2662  * ranges corresponding to the page, and then deletes any extent state
2663  * records from the tree
2664  */
2665 int extent_invalidatepage(struct extent_io_tree *tree,
2666                           struct page *page, unsigned long offset)
2667 {
2668         struct extent_state *cached_state = NULL;
2669         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2670         u64 end = start + PAGE_CACHE_SIZE - 1;
2671         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2672
2673         start += (offset + blocksize - 1) & ~(blocksize - 1);
2674         if (start > end)
2675                 return 0;
2676
2677         lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2678         wait_on_page_writeback(page);
2679         clear_extent_bit(tree, start, end,
2680                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2681                          EXTENT_DO_ACCOUNTING,
2682                          1, 1, &cached_state, GFP_NOFS);
2683         return 0;
2684 }
2685
2686 /*
2687  * a helper for releasepage, this tests for areas of the page that
2688  * are locked or under IO and drops the related state bits if it is safe
2689  * to drop the page.
2690  */
2691 int try_release_extent_state(struct extent_map_tree *map,
2692                              struct extent_io_tree *tree, struct page *page,
2693                              gfp_t mask)
2694 {
2695         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2696         u64 end = start + PAGE_CACHE_SIZE - 1;
2697         int ret = 1;
2698
2699         if (test_range_bit(tree, start, end,
2700                            EXTENT_IOBITS, 0, NULL))
2701                 ret = 0;
2702         else {
2703                 if ((mask & GFP_NOFS) == GFP_NOFS)
2704                         mask = GFP_NOFS;
2705                 /*
2706                  * at this point we can safely clear everything except the
2707                  * locked bit and the nodatasum bit
2708                  */
2709                 ret = clear_extent_bit(tree, start, end,
2710                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2711                                  0, 0, NULL, mask);
2712
2713                 /* if clear_extent_bit failed for enomem reasons,
2714                  * we can't allow the release to continue.
2715                  */
2716                 if (ret < 0)
2717                         ret = 0;
2718                 else
2719                         ret = 1;
2720         }
2721         return ret;
2722 }
2723
2724 /*
2725  * a helper for releasepage.  As long as there are no locked extents
2726  * in the range corresponding to the page, both state records and extent
2727  * map records are removed
2728  */
2729 int try_release_extent_mapping(struct extent_map_tree *map,
2730                                struct extent_io_tree *tree, struct page *page,
2731                                gfp_t mask)
2732 {
2733         struct extent_map *em;
2734         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2735         u64 end = start + PAGE_CACHE_SIZE - 1;
2736
2737         if ((mask & __GFP_WAIT) &&
2738             page->mapping->host->i_size > 16 * 1024 * 1024) {
2739                 u64 len;
2740                 while (start <= end) {
2741                         len = end - start + 1;
2742                         write_lock(&map->lock);
2743                         em = lookup_extent_mapping(map, start, len);
2744                         if (IS_ERR_OR_NULL(em)) {
2745                                 write_unlock(&map->lock);
2746                                 break;
2747                         }
2748                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2749                             em->start != start) {
2750                                 write_unlock(&map->lock);
2751                                 free_extent_map(em);
2752                                 break;
2753                         }
2754                         if (!test_range_bit(tree, em->start,
2755                                             extent_map_end(em) - 1,
2756                                             EXTENT_LOCKED | EXTENT_WRITEBACK,
2757                                             0, NULL)) {
2758                                 remove_extent_mapping(map, em);
2759                                 /* once for the rb tree */
2760                                 free_extent_map(em);
2761                         }
2762                         start = extent_map_end(em);
2763                         write_unlock(&map->lock);
2764
2765                         /* once for us */
2766                         free_extent_map(em);
2767                 }
2768         }
2769         return try_release_extent_state(map, tree, page, mask);
2770 }
2771
2772 /*
2773  * helper function for fiemap, which doesn't want to see any holes.
2774  * This maps until we find something past 'last'
2775  */
2776 static struct extent_map *get_extent_skip_holes(struct inode *inode,
2777                                                 u64 offset,
2778                                                 u64 last,
2779                                                 get_extent_t *get_extent)
2780 {
2781         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2782         struct extent_map *em;
2783         u64 len;
2784
2785         if (offset >= last)
2786                 return NULL;
2787
2788         while(1) {
2789                 len = last - offset;
2790                 if (len == 0)
2791                         break;
2792                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2793                 em = get_extent(inode, NULL, 0, offset, len, 0);
2794                 if (IS_ERR_OR_NULL(em))
2795                         return em;
2796
2797                 /* if this isn't a hole return it */
2798                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2799                     em->block_start != EXTENT_MAP_HOLE) {
2800                         return em;
2801                 }
2802
2803                 /* this is a hole, advance to the next extent */
2804                 offset = extent_map_end(em);
2805                 free_extent_map(em);
2806                 if (offset >= last)
2807                         break;
2808         }
2809         return NULL;
2810 }
2811
2812 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2813                 __u64 start, __u64 len, get_extent_t *get_extent)
2814 {
2815         int ret = 0;
2816         u64 off = start;
2817         u64 max = start + len;
2818         u32 flags = 0;
2819         u32 found_type;
2820         u64 last;
2821         u64 last_for_get_extent = 0;
2822         u64 disko = 0;
2823         u64 isize = i_size_read(inode);
2824         struct btrfs_key found_key;
2825         struct extent_map *em = NULL;
2826         struct extent_state *cached_state = NULL;
2827         struct btrfs_path *path;
2828         struct btrfs_file_extent_item *item;
2829         int end = 0;
2830         u64 em_start = 0;
2831         u64 em_len = 0;
2832         u64 em_end = 0;
2833         unsigned long emflags;
2834
2835         if (len == 0)
2836                 return -EINVAL;
2837
2838         path = btrfs_alloc_path();
2839         if (!path)
2840                 return -ENOMEM;
2841         path->leave_spinning = 1;
2842
2843         /*
2844          * lookup the last file extent.  We're not using i_size here
2845          * because there might be preallocation past i_size
2846          */
2847         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2848                                        path, btrfs_ino(inode), -1, 0);
2849         if (ret < 0) {
2850                 btrfs_free_path(path);
2851                 return ret;
2852         }
2853         WARN_ON(!ret);
2854         path->slots[0]--;
2855         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2856                               struct btrfs_file_extent_item);
2857         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2858         found_type = btrfs_key_type(&found_key);
2859
2860         /* No extents, but there might be delalloc bits */
2861         if (found_key.objectid != btrfs_ino(inode) ||
2862             found_type != BTRFS_EXTENT_DATA_KEY) {
2863                 /* have to trust i_size as the end */
2864                 last = (u64)-1;
2865                 last_for_get_extent = isize;
2866         } else {
2867                 /*
2868                  * remember the start of the last extent.  There are a
2869                  * bunch of different factors that go into the length of the
2870                  * extent, so its much less complex to remember where it started
2871                  */
2872                 last = found_key.offset;
2873                 last_for_get_extent = last + 1;
2874         }
2875         btrfs_free_path(path);
2876
2877         /*
2878          * we might have some extents allocated but more delalloc past those
2879          * extents.  so, we trust isize unless the start of the last extent is
2880          * beyond isize
2881          */
2882         if (last < isize) {
2883                 last = (u64)-1;
2884                 last_for_get_extent = isize;
2885         }
2886
2887         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2888                          &cached_state, GFP_NOFS);
2889
2890         em = get_extent_skip_holes(inode, off, last_for_get_extent,
2891                                    get_extent);
2892         if (!em)
2893                 goto out;
2894         if (IS_ERR(em)) {
2895                 ret = PTR_ERR(em);
2896                 goto out;
2897         }
2898
2899         while (!end) {
2900                 u64 offset_in_extent;
2901
2902                 /* break if the extent we found is outside the range */
2903                 if (em->start >= max || extent_map_end(em) < off)
2904                         break;
2905
2906                 /*
2907                  * get_extent may return an extent that starts before our
2908                  * requested range.  We have to make sure the ranges
2909                  * we return to fiemap always move forward and don't
2910                  * overlap, so adjust the offsets here
2911                  */
2912                 em_start = max(em->start, off);
2913
2914                 /*
2915                  * record the offset from the start of the extent
2916                  * for adjusting the disk offset below
2917                  */
2918                 offset_in_extent = em_start - em->start;
2919                 em_end = extent_map_end(em);
2920                 em_len = em_end - em_start;
2921                 emflags = em->flags;
2922                 disko = 0;
2923                 flags = 0;
2924
2925                 /*
2926                  * bump off for our next call to get_extent
2927                  */
2928                 off = extent_map_end(em);
2929                 if (off >= max)
2930                         end = 1;
2931
2932                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2933                         end = 1;
2934                         flags |= FIEMAP_EXTENT_LAST;
2935                 } else if (em->block_start == EXTENT_MAP_INLINE) {
2936                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
2937                                   FIEMAP_EXTENT_NOT_ALIGNED);
2938                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2939                         flags |= (FIEMAP_EXTENT_DELALLOC |
2940                                   FIEMAP_EXTENT_UNKNOWN);
2941                 } else {
2942                         disko = em->block_start + offset_in_extent;
2943                 }
2944                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2945                         flags |= FIEMAP_EXTENT_ENCODED;
2946
2947                 free_extent_map(em);
2948                 em = NULL;
2949                 if ((em_start >= last) || em_len == (u64)-1 ||
2950                    (last == (u64)-1 && isize <= em_end)) {
2951                         flags |= FIEMAP_EXTENT_LAST;
2952                         end = 1;
2953                 }
2954
2955                 /* now scan forward to see if this is really the last extent. */
2956                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2957                                            get_extent);
2958                 if (IS_ERR(em)) {
2959                         ret = PTR_ERR(em);
2960                         goto out;
2961                 }
2962                 if (!em) {
2963                         flags |= FIEMAP_EXTENT_LAST;
2964                         end = 1;
2965                 }
2966                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2967                                               em_len, flags);
2968                 if (ret)
2969                         goto out_free;
2970         }
2971 out_free:
2972         free_extent_map(em);
2973 out:
2974         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
2975                              &cached_state, GFP_NOFS);
2976         return ret;
2977 }
2978
2979 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2980                                               unsigned long i)
2981 {
2982         struct page *p;
2983         struct address_space *mapping;
2984
2985         if (i == 0)
2986                 return eb->first_page;
2987         i += eb->start >> PAGE_CACHE_SHIFT;
2988         mapping = eb->first_page->mapping;
2989         if (!mapping)
2990                 return NULL;
2991
2992         /*
2993          * extent_buffer_page is only called after pinning the page
2994          * by increasing the reference count.  So we know the page must
2995          * be in the radix tree.
2996          */
2997         rcu_read_lock();
2998         p = radix_tree_lookup(&mapping->page_tree, i);
2999         rcu_read_unlock();
3000
3001         return p;
3002 }
3003
3004 static inline unsigned long num_extent_pages(u64 start, u64 len)
3005 {
3006         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3007                 (start >> PAGE_CACHE_SHIFT);
3008 }
3009
3010 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3011                                                    u64 start,
3012                                                    unsigned long len,
3013                                                    gfp_t mask)
3014 {
3015         struct extent_buffer *eb = NULL;
3016 #if LEAK_DEBUG
3017         unsigned long flags;
3018 #endif
3019
3020         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3021         if (eb == NULL)
3022                 return NULL;
3023         eb->start = start;
3024         eb->len = len;
3025         spin_lock_init(&eb->lock);
3026         init_waitqueue_head(&eb->lock_wq);
3027
3028 #if LEAK_DEBUG
3029         spin_lock_irqsave(&leak_lock, flags);
3030         list_add(&eb->leak_list, &buffers);
3031         spin_unlock_irqrestore(&leak_lock, flags);
3032 #endif
3033         atomic_set(&eb->refs, 1);
3034
3035         return eb;
3036 }
3037
3038 static void __free_extent_buffer(struct extent_buffer *eb)
3039 {
3040 #if LEAK_DEBUG
3041         unsigned long flags;
3042         spin_lock_irqsave(&leak_lock, flags);
3043         list_del(&eb->leak_list);
3044         spin_unlock_irqrestore(&leak_lock, flags);
3045 #endif
3046         kmem_cache_free(extent_buffer_cache, eb);
3047 }
3048
3049 /*
3050  * Helper for releasing extent buffer page.
3051  */
3052 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3053                                                 unsigned long start_idx)
3054 {
3055         unsigned long index;
3056         struct page *page;
3057
3058         if (!eb->first_page)
3059                 return;
3060
3061         index = num_extent_pages(eb->start, eb->len);
3062         if (start_idx >= index)
3063                 return;
3064
3065         do {
3066                 index--;
3067                 page = extent_buffer_page(eb, index);
3068                 if (page)
3069                         page_cache_release(page);
3070         } while (index != start_idx);
3071 }
3072
3073 /*
3074  * Helper for releasing the extent buffer.
3075  */
3076 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3077 {
3078         btrfs_release_extent_buffer_page(eb, 0);
3079         __free_extent_buffer(eb);
3080 }
3081
3082 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3083                                           u64 start, unsigned long len,
3084                                           struct page *page0)
3085 {
3086         unsigned long num_pages = num_extent_pages(start, len);
3087         unsigned long i;
3088         unsigned long index = start >> PAGE_CACHE_SHIFT;
3089         struct extent_buffer *eb;
3090         struct extent_buffer *exists = NULL;
3091         struct page *p;
3092         struct address_space *mapping = tree->mapping;
3093         int uptodate = 1;
3094         int ret;
3095
3096         rcu_read_lock();
3097         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3098         if (eb && atomic_inc_not_zero(&eb->refs)) {
3099                 rcu_read_unlock();
3100                 mark_page_accessed(eb->first_page);
3101                 return eb;
3102         }
3103         rcu_read_unlock();
3104
3105         eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3106         if (!eb)
3107                 return NULL;
3108
3109         if (page0) {
3110                 eb->first_page = page0;
3111                 i = 1;
3112                 index++;
3113                 page_cache_get(page0);
3114                 mark_page_accessed(page0);