Btrfs: Index extent buffers in an rbtree
[sfrench/cifs-2.6.git] / fs / btrfs / extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17
18 /* temporary define until extent_map moves out of btrfs */
19 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
20                                        unsigned long extra_flags,
21                                        void (*ctor)(void *, struct kmem_cache *,
22                                                     unsigned long));
23
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
30
31 #define BUFFER_LRU_MAX 64
32
33 struct tree_entry {
34         u64 start;
35         u64 end;
36         struct rb_node rb_node;
37 };
38
39 struct extent_page_data {
40         struct bio *bio;
41         struct extent_io_tree *tree;
42         get_extent_t *get_extent;
43 };
44
45 int __init extent_io_init(void)
46 {
47         extent_state_cache = btrfs_cache_create("extent_state",
48                                             sizeof(struct extent_state), 0,
49                                             NULL);
50         if (!extent_state_cache)
51                 return -ENOMEM;
52
53         extent_buffer_cache = btrfs_cache_create("extent_buffers",
54                                             sizeof(struct extent_buffer), 0,
55                                             NULL);
56         if (!extent_buffer_cache)
57                 goto free_state_cache;
58         return 0;
59
60 free_state_cache:
61         kmem_cache_destroy(extent_state_cache);
62         return -ENOMEM;
63 }
64
65 void extent_io_exit(void)
66 {
67         struct extent_state *state;
68         struct extent_buffer *eb;
69
70         while (!list_empty(&states)) {
71                 state = list_entry(states.next, struct extent_state, leak_list);
72                 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
73                 list_del(&state->leak_list);
74                 kmem_cache_free(extent_state_cache, state);
75
76         }
77
78         while (!list_empty(&buffers)) {
79                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
80                 printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
81                 list_del(&eb->leak_list);
82                 kmem_cache_free(extent_buffer_cache, eb);
83         }
84         if (extent_state_cache)
85                 kmem_cache_destroy(extent_state_cache);
86         if (extent_buffer_cache)
87                 kmem_cache_destroy(extent_buffer_cache);
88 }
89
90 void extent_io_tree_init(struct extent_io_tree *tree,
91                           struct address_space *mapping, gfp_t mask)
92 {
93         tree->state.rb_node = NULL;
94         tree->buffer.rb_node = NULL;
95         tree->ops = NULL;
96         tree->dirty_bytes = 0;
97         spin_lock_init(&tree->lock);
98         spin_lock_init(&tree->buffer_lock);
99         tree->mapping = mapping;
100         tree->last = NULL;
101 }
102 EXPORT_SYMBOL(extent_io_tree_init);
103
104 struct extent_state *alloc_extent_state(gfp_t mask)
105 {
106         struct extent_state *state;
107         unsigned long flags;
108
109         state = kmem_cache_alloc(extent_state_cache, mask);
110         if (!state)
111                 return state;
112         state->state = 0;
113         state->private = 0;
114         state->tree = NULL;
115         spin_lock_irqsave(&leak_lock, flags);
116         list_add(&state->leak_list, &states);
117         spin_unlock_irqrestore(&leak_lock, flags);
118
119         atomic_set(&state->refs, 1);
120         init_waitqueue_head(&state->wq);
121         return state;
122 }
123 EXPORT_SYMBOL(alloc_extent_state);
124
125 void free_extent_state(struct extent_state *state)
126 {
127         if (!state)
128                 return;
129         if (atomic_dec_and_test(&state->refs)) {
130                 unsigned long flags;
131                 WARN_ON(state->tree);
132                 spin_lock_irqsave(&leak_lock, flags);
133                 list_del(&state->leak_list);
134                 spin_unlock_irqrestore(&leak_lock, flags);
135                 kmem_cache_free(extent_state_cache, state);
136         }
137 }
138 EXPORT_SYMBOL(free_extent_state);
139
140 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
141                                    struct rb_node *node)
142 {
143         struct rb_node ** p = &root->rb_node;
144         struct rb_node * parent = NULL;
145         struct tree_entry *entry;
146
147         while(*p) {
148                 parent = *p;
149                 entry = rb_entry(parent, struct tree_entry, rb_node);
150
151                 if (offset < entry->start)
152                         p = &(*p)->rb_left;
153                 else if (offset > entry->end)
154                         p = &(*p)->rb_right;
155                 else
156                         return parent;
157         }
158
159         entry = rb_entry(node, struct tree_entry, rb_node);
160         rb_link_node(node, parent, p);
161         rb_insert_color(node, root);
162         return NULL;
163 }
164
165 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
166                                      struct rb_node **prev_ret,
167                                      struct rb_node **next_ret)
168 {
169         struct rb_root *root = &tree->state;
170         struct rb_node * n = root->rb_node;
171         struct rb_node *prev = NULL;
172         struct rb_node *orig_prev = NULL;
173         struct tree_entry *entry;
174         struct tree_entry *prev_entry = NULL;
175
176         if (tree->last) {
177                 struct extent_state *state;
178                 state = tree->last;
179                 if (state->start <= offset && offset <= state->end)
180                         return &tree->last->rb_node;
181         }
182         while(n) {
183                 entry = rb_entry(n, struct tree_entry, rb_node);
184                 prev = n;
185                 prev_entry = entry;
186
187                 if (offset < entry->start)
188                         n = n->rb_left;
189                 else if (offset > entry->end)
190                         n = n->rb_right;
191                 else {
192                         tree->last = rb_entry(n, struct extent_state, rb_node);
193                         return n;
194                 }
195         }
196
197         if (prev_ret) {
198                 orig_prev = prev;
199                 while(prev && offset > prev_entry->end) {
200                         prev = rb_next(prev);
201                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
202                 }
203                 *prev_ret = prev;
204                 prev = orig_prev;
205         }
206
207         if (next_ret) {
208                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
209                 while(prev && offset < prev_entry->start) {
210                         prev = rb_prev(prev);
211                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
212                 }
213                 *next_ret = prev;
214         }
215         return NULL;
216 }
217
218 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
219                                           u64 offset)
220 {
221         struct rb_node *prev = NULL;
222         struct rb_node *ret;
223
224         ret = __etree_search(tree, offset, &prev, NULL);
225         if (!ret) {
226                 if (prev) {
227                         tree->last = rb_entry(prev, struct extent_state,
228                                               rb_node);
229                 }
230                 return prev;
231         }
232         return ret;
233 }
234
235 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
236                                           u64 offset, struct rb_node *node)
237 {
238         struct rb_root *root = &tree->buffer;
239         struct rb_node ** p = &root->rb_node;
240         struct rb_node * parent = NULL;
241         struct extent_buffer *eb;
242
243         while(*p) {
244                 parent = *p;
245                 eb = rb_entry(parent, struct extent_buffer, rb_node);
246
247                 if (offset < eb->start)
248                         p = &(*p)->rb_left;
249                 else if (offset > eb->start)
250                         p = &(*p)->rb_right;
251                 else
252                         return eb;
253         }
254
255         rb_link_node(node, parent, p);
256         rb_insert_color(node, root);
257         return NULL;
258 }
259
260 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
261                                            u64 offset)
262 {
263         struct rb_root *root = &tree->buffer;
264         struct rb_node * n = root->rb_node;
265         struct extent_buffer *eb;
266
267         while(n) {
268                 eb = rb_entry(n, struct extent_buffer, rb_node);
269                 if (offset < eb->start)
270                         n = n->rb_left;
271                 else if (offset > eb->start)
272                         n = n->rb_right;
273                 else
274                         return eb;
275         }
276         return NULL;
277 }
278
279 /*
280  * utility function to look for merge candidates inside a given range.
281  * Any extents with matching state are merged together into a single
282  * extent in the tree.  Extents with EXTENT_IO in their state field
283  * are not merged because the end_io handlers need to be able to do
284  * operations on them without sleeping (or doing allocations/splits).
285  *
286  * This should be called with the tree lock held.
287  */
288 static int merge_state(struct extent_io_tree *tree,
289                        struct extent_state *state)
290 {
291         struct extent_state *other;
292         struct rb_node *other_node;
293
294         if (state->state & EXTENT_IOBITS)
295                 return 0;
296
297         other_node = rb_prev(&state->rb_node);
298         if (other_node) {
299                 other = rb_entry(other_node, struct extent_state, rb_node);
300                 if (other->end == state->start - 1 &&
301                     other->state == state->state) {
302                         state->start = other->start;
303                         other->tree = NULL;
304                         if (tree->last == other)
305                                 tree->last = state;
306                         rb_erase(&other->rb_node, &tree->state);
307                         free_extent_state(other);
308                 }
309         }
310         other_node = rb_next(&state->rb_node);
311         if (other_node) {
312                 other = rb_entry(other_node, struct extent_state, rb_node);
313                 if (other->start == state->end + 1 &&
314                     other->state == state->state) {
315                         other->start = state->start;
316                         state->tree = NULL;
317                         if (tree->last == state)
318                                 tree->last = other;
319                         rb_erase(&state->rb_node, &tree->state);
320                         free_extent_state(state);
321                 }
322         }
323         return 0;
324 }
325
326 static void set_state_cb(struct extent_io_tree *tree,
327                          struct extent_state *state,
328                          unsigned long bits)
329 {
330         if (tree->ops && tree->ops->set_bit_hook) {
331                 tree->ops->set_bit_hook(tree->mapping->host, state->start,
332                                         state->end, state->state, bits);
333         }
334 }
335
336 static void clear_state_cb(struct extent_io_tree *tree,
337                            struct extent_state *state,
338                            unsigned long bits)
339 {
340         if (tree->ops && tree->ops->set_bit_hook) {
341                 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
342                                           state->end, state->state, bits);
343         }
344 }
345
346 /*
347  * insert an extent_state struct into the tree.  'bits' are set on the
348  * struct before it is inserted.
349  *
350  * This may return -EEXIST if the extent is already there, in which case the
351  * state struct is freed.
352  *
353  * The tree lock is not taken internally.  This is a utility function and
354  * probably isn't what you want to call (see set/clear_extent_bit).
355  */
356 static int insert_state(struct extent_io_tree *tree,
357                         struct extent_state *state, u64 start, u64 end,
358                         int bits)
359 {
360         struct rb_node *node;
361
362         if (end < start) {
363                 printk("end < start %Lu %Lu\n", end, start);
364                 WARN_ON(1);
365         }
366         if (bits & EXTENT_DIRTY)
367                 tree->dirty_bytes += end - start + 1;
368         set_state_cb(tree, state, bits);
369         state->state |= bits;
370         state->start = start;
371         state->end = end;
372         node = tree_insert(&tree->state, end, &state->rb_node);
373         if (node) {
374                 struct extent_state *found;
375                 found = rb_entry(node, struct extent_state, rb_node);
376                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
377                 free_extent_state(state);
378                 return -EEXIST;
379         }
380         state->tree = tree;
381         tree->last = state;
382         merge_state(tree, state);
383         return 0;
384 }
385
386 /*
387  * split a given extent state struct in two, inserting the preallocated
388  * struct 'prealloc' as the newly created second half.  'split' indicates an
389  * offset inside 'orig' where it should be split.
390  *
391  * Before calling,
392  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
393  * are two extent state structs in the tree:
394  * prealloc: [orig->start, split - 1]
395  * orig: [ split, orig->end ]
396  *
397  * The tree locks are not taken by this function. They need to be held
398  * by the caller.
399  */
400 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
401                        struct extent_state *prealloc, u64 split)
402 {
403         struct rb_node *node;
404         prealloc->start = orig->start;
405         prealloc->end = split - 1;
406         prealloc->state = orig->state;
407         orig->start = split;
408
409         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
410         if (node) {
411                 struct extent_state *found;
412                 found = rb_entry(node, struct extent_state, rb_node);
413                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
414                 free_extent_state(prealloc);
415                 return -EEXIST;
416         }
417         prealloc->tree = tree;
418         return 0;
419 }
420
421 /*
422  * utility function to clear some bits in an extent state struct.
423  * it will optionally wake up any one waiting on this state (wake == 1), or
424  * forcibly remove the state from the tree (delete == 1).
425  *
426  * If no bits are set on the state struct after clearing things, the
427  * struct is freed and removed from the tree
428  */
429 static int clear_state_bit(struct extent_io_tree *tree,
430                             struct extent_state *state, int bits, int wake,
431                             int delete)
432 {
433         int ret = state->state & bits;
434
435         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
436                 u64 range = state->end - state->start + 1;
437                 WARN_ON(range > tree->dirty_bytes);
438                 tree->dirty_bytes -= range;
439         }
440         clear_state_cb(tree, state, bits);
441         state->state &= ~bits;
442         if (wake)
443                 wake_up(&state->wq);
444         if (delete || state->state == 0) {
445                 if (state->tree) {
446                         clear_state_cb(tree, state, state->state);
447                         if (tree->last == state) {
448                                 tree->last = extent_state_next(state);
449                         }
450                         rb_erase(&state->rb_node, &tree->state);
451                         state->tree = NULL;
452                         free_extent_state(state);
453                 } else {
454                         WARN_ON(1);
455                 }
456         } else {
457                 merge_state(tree, state);
458         }
459         return ret;
460 }
461
462 /*
463  * clear some bits on a range in the tree.  This may require splitting
464  * or inserting elements in the tree, so the gfp mask is used to
465  * indicate which allocations or sleeping are allowed.
466  *
467  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
468  * the given range from the tree regardless of state (ie for truncate).
469  *
470  * the range [start, end] is inclusive.
471  *
472  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
473  * bits were already set, or zero if none of the bits were already set.
474  */
475 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
476                      int bits, int wake, int delete, gfp_t mask)
477 {
478         struct extent_state *state;
479         struct extent_state *prealloc = NULL;
480         struct rb_node *node;
481         unsigned long flags;
482         int err;
483         int set = 0;
484
485 again:
486         if (!prealloc && (mask & __GFP_WAIT)) {
487                 prealloc = alloc_extent_state(mask);
488                 if (!prealloc)
489                         return -ENOMEM;
490         }
491
492         spin_lock_irqsave(&tree->lock, flags);
493         /*
494          * this search will find the extents that end after
495          * our range starts
496          */
497         node = tree_search(tree, start);
498         if (!node)
499                 goto out;
500         state = rb_entry(node, struct extent_state, rb_node);
501         if (state->start > end)
502                 goto out;
503         WARN_ON(state->end < start);
504
505         /*
506          *     | ---- desired range ---- |
507          *  | state | or
508          *  | ------------- state -------------- |
509          *
510          * We need to split the extent we found, and may flip
511          * bits on second half.
512          *
513          * If the extent we found extends past our range, we
514          * just split and search again.  It'll get split again
515          * the next time though.
516          *
517          * If the extent we found is inside our range, we clear
518          * the desired bit on it.
519          */
520
521         if (state->start < start) {
522                 if (!prealloc)
523                         prealloc = alloc_extent_state(GFP_ATOMIC);
524                 err = split_state(tree, state, prealloc, start);
525                 BUG_ON(err == -EEXIST);
526                 prealloc = NULL;
527                 if (err)
528                         goto out;
529                 if (state->end <= end) {
530                         start = state->end + 1;
531                         set |= clear_state_bit(tree, state, bits,
532                                         wake, delete);
533                 } else {
534                         start = state->start;
535                 }
536                 goto search_again;
537         }
538         /*
539          * | ---- desired range ---- |
540          *                        | state |
541          * We need to split the extent, and clear the bit
542          * on the first half
543          */
544         if (state->start <= end && state->end > end) {
545                 if (!prealloc)
546                         prealloc = alloc_extent_state(GFP_ATOMIC);
547                 err = split_state(tree, state, prealloc, end + 1);
548                 BUG_ON(err == -EEXIST);
549
550                 if (wake)
551                         wake_up(&state->wq);
552                 set |= clear_state_bit(tree, prealloc, bits,
553                                        wake, delete);
554                 prealloc = NULL;
555                 goto out;
556         }
557
558         start = state->end + 1;
559         set |= clear_state_bit(tree, state, bits, wake, delete);
560         goto search_again;
561
562 out:
563         spin_unlock_irqrestore(&tree->lock, flags);
564         if (prealloc)
565                 free_extent_state(prealloc);
566
567         return set;
568
569 search_again:
570         if (start > end)
571                 goto out;
572         spin_unlock_irqrestore(&tree->lock, flags);
573         if (mask & __GFP_WAIT)
574                 cond_resched();
575         goto again;
576 }
577 EXPORT_SYMBOL(clear_extent_bit);
578
579 static int wait_on_state(struct extent_io_tree *tree,
580                          struct extent_state *state)
581 {
582         DEFINE_WAIT(wait);
583         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
584         spin_unlock_irq(&tree->lock);
585         schedule();
586         spin_lock_irq(&tree->lock);
587         finish_wait(&state->wq, &wait);
588         return 0;
589 }
590
591 /*
592  * waits for one or more bits to clear on a range in the state tree.
593  * The range [start, end] is inclusive.
594  * The tree lock is taken by this function
595  */
596 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
597 {
598         struct extent_state *state;
599         struct rb_node *node;
600
601         spin_lock_irq(&tree->lock);
602 again:
603         while (1) {
604                 /*
605                  * this search will find all the extents that end after
606                  * our range starts
607                  */
608                 node = tree_search(tree, start);
609                 if (!node)
610                         break;
611
612                 state = rb_entry(node, struct extent_state, rb_node);
613
614                 if (state->start > end)
615                         goto out;
616
617                 if (state->state & bits) {
618                         start = state->start;
619                         atomic_inc(&state->refs);
620                         wait_on_state(tree, state);
621                         free_extent_state(state);
622                         goto again;
623                 }
624                 start = state->end + 1;
625
626                 if (start > end)
627                         break;
628
629                 if (need_resched()) {
630                         spin_unlock_irq(&tree->lock);
631                         cond_resched();
632                         spin_lock_irq(&tree->lock);
633                 }
634         }
635 out:
636         spin_unlock_irq(&tree->lock);
637         return 0;
638 }
639 EXPORT_SYMBOL(wait_extent_bit);
640
641 static void set_state_bits(struct extent_io_tree *tree,
642                            struct extent_state *state,
643                            int bits)
644 {
645         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
646                 u64 range = state->end - state->start + 1;
647                 tree->dirty_bytes += range;
648         }
649         set_state_cb(tree, state, bits);
650         state->state |= bits;
651 }
652
653 /*
654  * set some bits on a range in the tree.  This may require allocations
655  * or sleeping, so the gfp mask is used to indicate what is allowed.
656  *
657  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
658  * range already has the desired bits set.  The start of the existing
659  * range is returned in failed_start in this case.
660  *
661  * [start, end] is inclusive
662  * This takes the tree lock.
663  */
664 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
665                    int exclusive, u64 *failed_start, gfp_t mask)
666 {
667         struct extent_state *state;
668         struct extent_state *prealloc = NULL;
669         struct rb_node *node;
670         unsigned long flags;
671         int err = 0;
672         int set;
673         u64 last_start;
674         u64 last_end;
675 again:
676         if (!prealloc && (mask & __GFP_WAIT)) {
677                 prealloc = alloc_extent_state(mask);
678                 if (!prealloc)
679                         return -ENOMEM;
680         }
681
682         spin_lock_irqsave(&tree->lock, flags);
683         /*
684          * this search will find all the extents that end after
685          * our range starts.
686          */
687         node = tree_search(tree, start);
688         if (!node) {
689                 err = insert_state(tree, prealloc, start, end, bits);
690                 prealloc = NULL;
691                 BUG_ON(err == -EEXIST);
692                 goto out;
693         }
694
695         state = rb_entry(node, struct extent_state, rb_node);
696         last_start = state->start;
697         last_end = state->end;
698
699         /*
700          * | ---- desired range ---- |
701          * | state |
702          *
703          * Just lock what we found and keep going
704          */
705         if (state->start == start && state->end <= end) {
706                 set = state->state & bits;
707                 if (set && exclusive) {
708                         *failed_start = state->start;
709                         err = -EEXIST;
710                         goto out;
711                 }
712                 set_state_bits(tree, state, bits);
713                 start = state->end + 1;
714                 merge_state(tree, state);
715                 goto search_again;
716         }
717
718         /*
719          *     | ---- desired range ---- |
720          * | state |
721          *   or
722          * | ------------- state -------------- |
723          *
724          * We need to split the extent we found, and may flip bits on
725          * second half.
726          *
727          * If the extent we found extends past our
728          * range, we just split and search again.  It'll get split
729          * again the next time though.
730          *
731          * If the extent we found is inside our range, we set the
732          * desired bit on it.
733          */
734         if (state->start < start) {
735                 set = state->state & bits;
736                 if (exclusive && set) {
737                         *failed_start = start;
738                         err = -EEXIST;
739                         goto out;
740                 }
741                 err = split_state(tree, state, prealloc, start);
742                 BUG_ON(err == -EEXIST);
743                 prealloc = NULL;
744                 if (err)
745                         goto out;
746                 if (state->end <= end) {
747                         set_state_bits(tree, state, bits);
748                         start = state->end + 1;
749                         merge_state(tree, state);
750                 } else {
751                         start = state->start;
752                 }
753                 goto search_again;
754         }
755         /*
756          * | ---- desired range ---- |
757          *     | state | or               | state |
758          *
759          * There's a hole, we need to insert something in it and
760          * ignore the extent we found.
761          */
762         if (state->start > start) {
763                 u64 this_end;
764                 if (end < last_start)
765                         this_end = end;
766                 else
767                         this_end = last_start -1;
768                 err = insert_state(tree, prealloc, start, this_end,
769                                    bits);
770                 prealloc = NULL;
771                 BUG_ON(err == -EEXIST);
772                 if (err)
773                         goto out;
774                 start = this_end + 1;
775                 goto search_again;
776         }
777         /*
778          * | ---- desired range ---- |
779          *                        | state |
780          * We need to split the extent, and set the bit
781          * on the first half
782          */
783         if (state->start <= end && state->end > end) {
784                 set = state->state & bits;
785                 if (exclusive && set) {
786                         *failed_start = start;
787                         err = -EEXIST;
788                         goto out;
789                 }
790                 err = split_state(tree, state, prealloc, end + 1);
791                 BUG_ON(err == -EEXIST);
792
793                 set_state_bits(tree, prealloc, bits);
794                 merge_state(tree, prealloc);
795                 prealloc = NULL;
796                 goto out;
797         }
798
799         goto search_again;
800
801 out:
802         spin_unlock_irqrestore(&tree->lock, flags);
803         if (prealloc)
804                 free_extent_state(prealloc);
805
806         return err;
807
808 search_again:
809         if (start > end)
810                 goto out;
811         spin_unlock_irqrestore(&tree->lock, flags);
812         if (mask & __GFP_WAIT)
813                 cond_resched();
814         goto again;
815 }
816 EXPORT_SYMBOL(set_extent_bit);
817
818 /* wrappers around set/clear extent bit */
819 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
820                      gfp_t mask)
821 {
822         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
823                               mask);
824 }
825 EXPORT_SYMBOL(set_extent_dirty);
826
827 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
828                        gfp_t mask)
829 {
830         return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
831 }
832 EXPORT_SYMBOL(set_extent_ordered);
833
834 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835                     int bits, gfp_t mask)
836 {
837         return set_extent_bit(tree, start, end, bits, 0, NULL,
838                               mask);
839 }
840 EXPORT_SYMBOL(set_extent_bits);
841
842 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
843                       int bits, gfp_t mask)
844 {
845         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
846 }
847 EXPORT_SYMBOL(clear_extent_bits);
848
849 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
850                      gfp_t mask)
851 {
852         return set_extent_bit(tree, start, end,
853                               EXTENT_DELALLOC | EXTENT_DIRTY,
854                               0, NULL, mask);
855 }
856 EXPORT_SYMBOL(set_extent_delalloc);
857
858 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
859                        gfp_t mask)
860 {
861         return clear_extent_bit(tree, start, end,
862                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
863 }
864 EXPORT_SYMBOL(clear_extent_dirty);
865
866 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
867                          gfp_t mask)
868 {
869         return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
870 }
871 EXPORT_SYMBOL(clear_extent_ordered);
872
873 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
874                      gfp_t mask)
875 {
876         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
877                               mask);
878 }
879 EXPORT_SYMBOL(set_extent_new);
880
881 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
882                        gfp_t mask)
883 {
884         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
885 }
886 EXPORT_SYMBOL(clear_extent_new);
887
888 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
889                         gfp_t mask)
890 {
891         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
892                               mask);
893 }
894 EXPORT_SYMBOL(set_extent_uptodate);
895
896 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
897                           gfp_t mask)
898 {
899         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
900 }
901 EXPORT_SYMBOL(clear_extent_uptodate);
902
903 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
904                          gfp_t mask)
905 {
906         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
907                               0, NULL, mask);
908 }
909 EXPORT_SYMBOL(set_extent_writeback);
910
911 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
912                            gfp_t mask)
913 {
914         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
915 }
916 EXPORT_SYMBOL(clear_extent_writeback);
917
918 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
919 {
920         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
921 }
922 EXPORT_SYMBOL(wait_on_extent_writeback);
923
924 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
925 {
926         int err;
927         u64 failed_start;
928         while (1) {
929                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
930                                      &failed_start, mask);
931                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
932                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
933                         start = failed_start;
934                 } else {
935                         break;
936                 }
937                 WARN_ON(start > end);
938         }
939         return err;
940 }
941 EXPORT_SYMBOL(lock_extent);
942
943 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
944                   gfp_t mask)
945 {
946         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
947 }
948 EXPORT_SYMBOL(unlock_extent);
949
950 /*
951  * helper function to set pages and extents in the tree dirty
952  */
953 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
954 {
955         unsigned long index = start >> PAGE_CACHE_SHIFT;
956         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
957         struct page *page;
958
959         while (index <= end_index) {
960                 page = find_get_page(tree->mapping, index);
961                 BUG_ON(!page);
962                 __set_page_dirty_nobuffers(page);
963                 page_cache_release(page);
964                 index++;
965         }
966         set_extent_dirty(tree, start, end, GFP_NOFS);
967         return 0;
968 }
969 EXPORT_SYMBOL(set_range_dirty);
970
971 /*
972  * helper function to set both pages and extents in the tree writeback
973  */
974 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
975 {
976         unsigned long index = start >> PAGE_CACHE_SHIFT;
977         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
978         struct page *page;
979
980         while (index <= end_index) {
981                 page = find_get_page(tree->mapping, index);
982                 BUG_ON(!page);
983                 set_page_writeback(page);
984                 page_cache_release(page);
985                 index++;
986         }
987         set_extent_writeback(tree, start, end, GFP_NOFS);
988         return 0;
989 }
990 EXPORT_SYMBOL(set_range_writeback);
991
992 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
993                           u64 *start_ret, u64 *end_ret, int bits)
994 {
995         struct rb_node *node;
996         struct extent_state *state;
997         int ret = 1;
998
999         spin_lock_irq(&tree->lock);
1000         /*
1001          * this search will find all the extents that end after
1002          * our range starts.
1003          */
1004         node = tree_search(tree, start);
1005         if (!node) {
1006                 goto out;
1007         }
1008
1009         while(1) {
1010                 state = rb_entry(node, struct extent_state, rb_node);
1011                 if (state->end >= start && (state->state & bits)) {
1012                         *start_ret = state->start;
1013                         *end_ret = state->end;
1014                         ret = 0;
1015                         break;
1016                 }
1017                 node = rb_next(node);
1018                 if (!node)
1019                         break;
1020         }
1021 out:
1022         spin_unlock_irq(&tree->lock);
1023         return ret;
1024 }
1025 EXPORT_SYMBOL(find_first_extent_bit);
1026
1027 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1028                                                  u64 start, int bits)
1029 {
1030         struct rb_node *node;
1031         struct extent_state *state;
1032
1033         /*
1034          * this search will find all the extents that end after
1035          * our range starts.
1036          */
1037         node = tree_search(tree, start);
1038         if (!node) {
1039                 goto out;
1040         }
1041
1042         while(1) {
1043                 state = rb_entry(node, struct extent_state, rb_node);
1044                 if (state->end >= start && (state->state & bits)) {
1045                         return state;
1046                 }
1047                 node = rb_next(node);
1048                 if (!node)
1049                         break;
1050         }
1051 out:
1052         return NULL;
1053 }
1054 EXPORT_SYMBOL(find_first_extent_bit_state);
1055
1056 u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1057                              u64 *start, u64 *end, u64 max_bytes)
1058 {
1059         struct rb_node *node;
1060         struct extent_state *state;
1061         u64 cur_start = *start;
1062         u64 found = 0;
1063         u64 total_bytes = 0;
1064
1065         spin_lock_irq(&tree->lock);
1066         /*
1067          * this search will find all the extents that end after
1068          * our range starts.
1069          */
1070 search_again:
1071         node = tree_search(tree, cur_start);
1072         if (!node) {
1073                 if (!found)
1074                         *end = (u64)-1;
1075                 goto out;
1076         }
1077
1078         while(1) {
1079                 state = rb_entry(node, struct extent_state, rb_node);
1080                 if (found && state->start != cur_start) {
1081                         goto out;
1082                 }
1083                 if (!(state->state & EXTENT_DELALLOC)) {
1084                         if (!found)
1085                                 *end = state->end;
1086                         goto out;
1087                 }
1088                 if (!found) {
1089                         struct extent_state *prev_state;
1090                         struct rb_node *prev_node = node;
1091                         while(1) {
1092                                 prev_node = rb_prev(prev_node);
1093                                 if (!prev_node)
1094                                         break;
1095                                 prev_state = rb_entry(prev_node,
1096                                                       struct extent_state,
1097                                                       rb_node);
1098                                 if (!(prev_state->state & EXTENT_DELALLOC))
1099                                         break;
1100                                 state = prev_state;
1101                                 node = prev_node;
1102                         }
1103                 }
1104                 if (state->state & EXTENT_LOCKED) {
1105                         DEFINE_WAIT(wait);
1106                         atomic_inc(&state->refs);
1107                         prepare_to_wait(&state->wq, &wait,
1108                                         TASK_UNINTERRUPTIBLE);
1109                         spin_unlock_irq(&tree->lock);
1110                         schedule();
1111                         spin_lock_irq(&tree->lock);
1112                         finish_wait(&state->wq, &wait);
1113                         free_extent_state(state);
1114                         goto search_again;
1115                 }
1116                 set_state_cb(tree, state, EXTENT_LOCKED);
1117                 state->state |= EXTENT_LOCKED;
1118                 if (!found)
1119                         *start = state->start;
1120                 found++;
1121                 *end = state->end;
1122                 cur_start = state->end + 1;
1123                 node = rb_next(node);
1124                 if (!node)
1125                         break;
1126                 total_bytes += state->end - state->start + 1;
1127                 if (total_bytes >= max_bytes)
1128                         break;
1129         }
1130 out:
1131         spin_unlock_irq(&tree->lock);
1132         return found;
1133 }
1134
1135 u64 count_range_bits(struct extent_io_tree *tree,
1136                      u64 *start, u64 search_end, u64 max_bytes,
1137                      unsigned long bits)
1138 {
1139         struct rb_node *node;
1140         struct extent_state *state;
1141         u64 cur_start = *start;
1142         u64 total_bytes = 0;
1143         int found = 0;
1144
1145         if (search_end <= cur_start) {
1146                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1147                 WARN_ON(1);
1148                 return 0;
1149         }
1150
1151         spin_lock_irq(&tree->lock);
1152         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1153                 total_bytes = tree->dirty_bytes;
1154                 goto out;
1155         }
1156         /*
1157          * this search will find all the extents that end after
1158          * our range starts.
1159          */
1160         node = tree_search(tree, cur_start);
1161         if (!node) {
1162                 goto out;
1163         }
1164
1165         while(1) {
1166                 state = rb_entry(node, struct extent_state, rb_node);
1167                 if (state->start > search_end)
1168                         break;
1169                 if (state->end >= cur_start && (state->state & bits)) {
1170                         total_bytes += min(search_end, state->end) + 1 -
1171                                        max(cur_start, state->start);
1172                         if (total_bytes >= max_bytes)
1173                                 break;
1174                         if (!found) {
1175                                 *start = state->start;
1176                                 found = 1;
1177                         }
1178                 }
1179                 node = rb_next(node);
1180                 if (!node)
1181                         break;
1182         }
1183 out:
1184         spin_unlock_irq(&tree->lock);
1185         return total_bytes;
1186 }
1187 /*
1188  * helper function to lock both pages and extents in the tree.
1189  * pages must be locked first.
1190  */
1191 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1192 {
1193         unsigned long index = start >> PAGE_CACHE_SHIFT;
1194         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1195         struct page *page;
1196         int err;
1197
1198         while (index <= end_index) {
1199                 page = grab_cache_page(tree->mapping, index);
1200                 if (!page) {
1201                         err = -ENOMEM;
1202                         goto failed;
1203                 }
1204                 if (IS_ERR(page)) {
1205                         err = PTR_ERR(page);
1206                         goto failed;
1207                 }
1208                 index++;
1209         }
1210         lock_extent(tree, start, end, GFP_NOFS);
1211         return 0;
1212
1213 failed:
1214         /*
1215          * we failed above in getting the page at 'index', so we undo here
1216          * up to but not including the page at 'index'
1217          */
1218         end_index = index;
1219         index = start >> PAGE_CACHE_SHIFT;
1220         while (index < end_index) {
1221                 page = find_get_page(tree->mapping, index);
1222                 unlock_page(page);
1223                 page_cache_release(page);
1224                 index++;
1225         }
1226         return err;
1227 }
1228 EXPORT_SYMBOL(lock_range);
1229
1230 /*
1231  * helper function to unlock both pages and extents in the tree.
1232  */
1233 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1234 {
1235         unsigned long index = start >> PAGE_CACHE_SHIFT;
1236         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1237         struct page *page;
1238
1239         while (index <= end_index) {
1240                 page = find_get_page(tree->mapping, index);
1241                 unlock_page(page);
1242                 page_cache_release(page);
1243                 index++;
1244         }
1245         unlock_extent(tree, start, end, GFP_NOFS);
1246         return 0;
1247 }
1248 EXPORT_SYMBOL(unlock_range);
1249
1250 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1251 {
1252         struct rb_node *node;
1253         struct extent_state *state;
1254         int ret = 0;
1255
1256         spin_lock_irq(&tree->lock);
1257         /*
1258          * this search will find all the extents that end after
1259          * our range starts.
1260          */
1261         node = tree_search(tree, start);
1262         if (!node) {
1263                 ret = -ENOENT;
1264                 goto out;
1265         }
1266         state = rb_entry(node, struct extent_state, rb_node);
1267         if (state->start != start) {
1268                 ret = -ENOENT;
1269                 goto out;
1270         }
1271         state->private = private;
1272 out:
1273         spin_unlock_irq(&tree->lock);
1274         return ret;
1275 }
1276
1277 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1278 {
1279         struct rb_node *node;
1280         struct extent_state *state;
1281         int ret = 0;
1282
1283         spin_lock_irq(&tree->lock);
1284         /*
1285          * this search will find all the extents that end after
1286          * our range starts.
1287          */
1288         node = tree_search(tree, start);
1289         if (!node) {
1290                 ret = -ENOENT;
1291                 goto out;
1292         }
1293         state = rb_entry(node, struct extent_state, rb_node);
1294         if (state->start != start) {
1295                 ret = -ENOENT;
1296                 goto out;
1297         }
1298         *private = state->private;
1299 out:
1300         spin_unlock_irq(&tree->lock);
1301         return ret;
1302 }
1303
1304 /*
1305  * searches a range in the state tree for a given mask.
1306  * If 'filled' == 1, this returns 1 only if every extent in the tree
1307  * has the bits set.  Otherwise, 1 is returned if any bit in the
1308  * range is found set.
1309  */
1310 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1311                    int bits, int filled)
1312 {
1313         struct extent_state *state = NULL;
1314         struct rb_node *node;
1315         int bitset = 0;
1316         unsigned long flags;
1317
1318         spin_lock_irqsave(&tree->lock, flags);
1319         node = tree_search(tree, start);
1320         while (node && start <= end) {
1321                 state = rb_entry(node, struct extent_state, rb_node);
1322
1323                 if (filled && state->start > start) {
1324                         bitset = 0;
1325                         break;
1326                 }
1327
1328                 if (state->start > end)
1329                         break;
1330
1331                 if (state->state & bits) {
1332                         bitset = 1;
1333                         if (!filled)
1334                                 break;
1335                 } else if (filled) {
1336                         bitset = 0;
1337                         break;
1338                 }
1339                 start = state->end + 1;
1340                 if (start > end)
1341                         break;
1342                 node = rb_next(node);
1343                 if (!node) {
1344                         if (filled)
1345                                 bitset = 0;
1346                         break;
1347                 }
1348         }
1349         spin_unlock_irqrestore(&tree->lock, flags);
1350         return bitset;
1351 }
1352 EXPORT_SYMBOL(test_range_bit);
1353
1354 /*
1355  * helper function to set a given page up to date if all the
1356  * extents in the tree for that page are up to date
1357  */
1358 static int check_page_uptodate(struct extent_io_tree *tree,
1359                                struct page *page)
1360 {
1361         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1362         u64 end = start + PAGE_CACHE_SIZE - 1;
1363         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1364                 SetPageUptodate(page);
1365         return 0;
1366 }
1367
1368 /*
1369  * helper function to unlock a page if all the extents in the tree
1370  * for that page are unlocked
1371  */
1372 static int check_page_locked(struct extent_io_tree *tree,
1373                              struct page *page)
1374 {
1375         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1376         u64 end = start + PAGE_CACHE_SIZE - 1;
1377         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1378                 unlock_page(page);
1379         return 0;
1380 }
1381
1382 /*
1383  * helper function to end page writeback if all the extents
1384  * in the tree for that page are done with writeback
1385  */
1386 static int check_page_writeback(struct extent_io_tree *tree,
1387                              struct page *page)
1388 {
1389         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1390         u64 end = start + PAGE_CACHE_SIZE - 1;
1391         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1392                 end_page_writeback(page);
1393         return 0;
1394 }
1395
1396 /* lots and lots of room for performance fixes in the end_bio funcs */
1397
1398 /*
1399  * after a writepage IO is done, we need to:
1400  * clear the uptodate bits on error
1401  * clear the writeback bits in the extent tree for this IO
1402  * end_page_writeback if the page has no more pending IO
1403  *
1404  * Scheduling is not allowed, so the extent state tree is expected
1405  * to have one and only one object corresponding to this IO.
1406  */
1407 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1408 static void end_bio_extent_writepage(struct bio *bio, int err)
1409 #else
1410 static int end_bio_extent_writepage(struct bio *bio,
1411                                    unsigned int bytes_done, int err)
1412 #endif
1413 {
1414         int uptodate = err == 0;
1415         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1416         struct extent_state *state = bio->bi_private;
1417         struct extent_io_tree *tree = state->tree;
1418         struct rb_node *node;
1419         u64 start;
1420         u64 end;
1421         u64 cur;
1422         int whole_page;
1423         int ret;
1424         unsigned long flags;
1425
1426 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1427         if (bio->bi_size)
1428                 return 1;
1429 #endif
1430         do {
1431                 struct page *page = bvec->bv_page;
1432                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1433                          bvec->bv_offset;
1434                 end = start + bvec->bv_len - 1;
1435
1436                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1437                         whole_page = 1;
1438                 else
1439                         whole_page = 0;
1440
1441                 if (--bvec >= bio->bi_io_vec)
1442                         prefetchw(&bvec->bv_page->flags);
1443                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1444                         ret = tree->ops->writepage_end_io_hook(page, start,
1445                                                        end, state, uptodate);
1446                         if (ret)
1447                                 uptodate = 0;
1448                 }
1449
1450                 if (!uptodate && tree->ops &&
1451                     tree->ops->writepage_io_failed_hook) {
1452                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1453                                                          start, end, state);
1454                         if (ret == 0) {
1455                                 state = NULL;
1456                                 uptodate = (err == 0);
1457                                 continue;
1458                         }
1459                 }
1460
1461                 if (!uptodate) {
1462                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1463                         ClearPageUptodate(page);
1464                         SetPageError(page);
1465                 }
1466
1467                 /*
1468                  * bios can get merged in funny ways, and so we need to
1469                  * be careful with the state variable.  We know the
1470                  * state won't be merged with others because it has
1471                  * WRITEBACK set, but we can't be sure each biovec is
1472                  * sequential in the file.  So, if our cached state
1473                  * doesn't match the expected end, search the tree
1474                  * for the correct one.
1475                  */
1476
1477                 spin_lock_irqsave(&tree->lock, flags);
1478                 if (!state || state->end != end) {
1479                         state = NULL;
1480                         node = __etree_search(tree, start, NULL, NULL);
1481                         if (node) {
1482                                 state = rb_entry(node, struct extent_state,
1483                                                  rb_node);
1484                                 if (state->end != end ||
1485                                     !(state->state & EXTENT_WRITEBACK))
1486                                         state = NULL;
1487                         }
1488                         if (!state) {
1489                                 spin_unlock_irqrestore(&tree->lock, flags);
1490                                 clear_extent_writeback(tree, start,
1491                                                        end, GFP_ATOMIC);
1492                                 goto next_io;
1493                         }
1494                 }
1495                 cur = end;
1496                 while(1) {
1497                         struct extent_state *clear = state;
1498                         cur = state->start;
1499                         node = rb_prev(&state->rb_node);
1500                         if (node) {
1501                                 state = rb_entry(node,
1502                                                  struct extent_state,
1503                                                  rb_node);
1504                         } else {
1505                                 state = NULL;
1506                         }
1507
1508                         clear_state_bit(tree, clear, EXTENT_WRITEBACK,
1509                                         1, 0);
1510                         if (cur == start)
1511                                 break;
1512                         if (cur < start) {
1513                                 WARN_ON(1);
1514                                 break;
1515                         }
1516                         if (!node)
1517                                 break;
1518                 }
1519                 /* before releasing the lock, make sure the next state
1520                  * variable has the expected bits set and corresponds
1521                  * to the correct offsets in the file
1522                  */
1523                 if (state && (state->end + 1 != start ||
1524                     !(state->state & EXTENT_WRITEBACK))) {
1525                         state = NULL;
1526                 }
1527                 spin_unlock_irqrestore(&tree->lock, flags);
1528 next_io:
1529
1530                 if (whole_page)
1531                         end_page_writeback(page);
1532                 else
1533                         check_page_writeback(tree, page);
1534         } while (bvec >= bio->bi_io_vec);
1535         bio_put(bio);
1536 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1537         return 0;
1538 #endif
1539 }
1540
1541 /*
1542  * after a readpage IO is done, we need to:
1543  * clear the uptodate bits on error
1544  * set the uptodate bits if things worked
1545  * set the page up to date if all extents in the tree are uptodate
1546  * clear the lock bit in the extent tree
1547  * unlock the page if there are no other extents locked for it
1548  *
1549  * Scheduling is not allowed, so the extent state tree is expected
1550  * to have one and only one object corresponding to this IO.
1551  */
1552 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1553 static void end_bio_extent_readpage(struct bio *bio, int err)
1554 #else
1555 static int end_bio_extent_readpage(struct bio *bio,
1556                                    unsigned int bytes_done, int err)
1557 #endif
1558 {
1559         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1560         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1561         struct extent_state *state = bio->bi_private;
1562         struct extent_io_tree *tree = state->tree;
1563         struct rb_node *node;
1564         u64 start;
1565         u64 end;
1566         u64 cur;
1567         unsigned long flags;
1568         int whole_page;
1569         int ret;
1570
1571 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1572         if (bio->bi_size)
1573                 return 1;
1574 #endif
1575
1576         do {
1577                 struct page *page = bvec->bv_page;
1578                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1579                         bvec->bv_offset;
1580                 end = start + bvec->bv_len - 1;
1581
1582                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1583                         whole_page = 1;
1584                 else
1585                         whole_page = 0;
1586
1587                 if (--bvec >= bio->bi_io_vec)
1588                         prefetchw(&bvec->bv_page->flags);
1589
1590                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1591                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1592                                                               state);
1593                         if (ret)
1594                                 uptodate = 0;
1595                 }
1596                 if (!uptodate && tree->ops &&
1597                     tree->ops->readpage_io_failed_hook) {
1598                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1599                                                          start, end, state);
1600                         if (ret == 0) {
1601                                 state = NULL;
1602                                 uptodate =
1603                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1604                                 continue;
1605                         }
1606                 }
1607
1608                 spin_lock_irqsave(&tree->lock, flags);
1609                 if (!state || state->end != end) {
1610                         state = NULL;
1611                         node = __etree_search(tree, start, NULL, NULL);
1612                         if (node) {
1613                                 state = rb_entry(node, struct extent_state,
1614                                                  rb_node);
1615                                 if (state->end != end ||
1616                                     !(state->state & EXTENT_LOCKED))
1617                                         state = NULL;
1618                         }
1619                         if (!state) {
1620                                 spin_unlock_irqrestore(&tree->lock, flags);
1621                                 if (uptodate)
1622                                         set_extent_uptodate(tree, start, end,
1623                                                             GFP_ATOMIC);
1624                                 unlock_extent(tree, start, end, GFP_ATOMIC);
1625                                 goto next_io;
1626                         }
1627                 }
1628
1629                 cur = end;
1630                 while(1) {
1631                         struct extent_state *clear = state;
1632                         cur = state->start;
1633                         node = rb_prev(&state->rb_node);
1634                         if (node) {
1635                                 state = rb_entry(node,
1636                                          struct extent_state,
1637                                          rb_node);
1638                         } else {
1639                                 state = NULL;
1640                         }
1641                         if (uptodate) {
1642                                 set_state_cb(tree, clear, EXTENT_UPTODATE);
1643                                 clear->state |= EXTENT_UPTODATE;
1644                         }
1645                         clear_state_bit(tree, clear, EXTENT_LOCKED,
1646                                         1, 0);
1647                         if (cur == start)
1648                                 break;
1649                         if (cur < start) {
1650                                 WARN_ON(1);
1651                                 break;
1652                         }
1653                         if (!node)
1654                                 break;
1655                 }
1656                 /* before releasing the lock, make sure the next state
1657                  * variable has the expected bits set and corresponds
1658                  * to the correct offsets in the file
1659                  */
1660                 if (state && (state->end + 1 != start ||
1661                     !(state->state & EXTENT_LOCKED))) {
1662                         state = NULL;
1663                 }
1664                 spin_unlock_irqrestore(&tree->lock, flags);
1665 next_io:
1666                 if (whole_page) {
1667                         if (uptodate) {
1668                                 SetPageUptodate(page);
1669                         } else {
1670                                 ClearPageUptodate(page);
1671                                 SetPageError(page);
1672                         }
1673                         unlock_page(page);
1674                 } else {
1675                         if (uptodate) {
1676                                 check_page_uptodate(tree, page);
1677                         } else {
1678                                 ClearPageUptodate(page);
1679                                 SetPageError(page);
1680                         }
1681                         check_page_locked(tree, page);
1682                 }
1683         } while (bvec >= bio->bi_io_vec);
1684
1685         bio_put(bio);
1686 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1687         return 0;
1688 #endif
1689 }
1690
1691 /*
1692  * IO done from prepare_write is pretty simple, we just unlock
1693  * the structs in the extent tree when done, and set the uptodate bits
1694  * as appropriate.
1695  */
1696 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1697 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1698 #else
1699 static int end_bio_extent_preparewrite(struct bio *bio,
1700                                        unsigned int bytes_done, int err)
1701 #endif
1702 {
1703         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1704         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1705         struct extent_state *state = bio->bi_private;
1706         struct extent_io_tree *tree = state->tree;
1707         u64 start;
1708         u64 end;
1709
1710 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1711         if (bio->bi_size)
1712                 return 1;
1713 #endif
1714
1715         do {
1716                 struct page *page = bvec->bv_page;
1717                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1718                         bvec->bv_offset;
1719                 end = start + bvec->bv_len - 1;
1720
1721                 if (--bvec >= bio->bi_io_vec)
1722                         prefetchw(&bvec->bv_page->flags);
1723
1724                 if (uptodate) {
1725                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1726                 } else {
1727                         ClearPageUptodate(page);
1728                         SetPageError(page);
1729                 }
1730
1731                 unlock_extent(tree, start, end, GFP_ATOMIC);
1732
1733         } while (bvec >= bio->bi_io_vec);
1734
1735         bio_put(bio);
1736 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1737         return 0;
1738 #endif
1739 }
1740
1741 static struct bio *
1742 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1743                  gfp_t gfp_flags)
1744 {
1745         struct bio *bio;
1746
1747         bio = bio_alloc(gfp_flags, nr_vecs);
1748
1749         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1750                 while (!bio && (nr_vecs /= 2))
1751                         bio = bio_alloc(gfp_flags, nr_vecs);
1752         }
1753
1754         if (bio) {
1755                 bio->bi_size = 0;
1756                 bio->bi_bdev = bdev;
1757                 bio->bi_sector = first_sector;
1758         }
1759         return bio;
1760 }
1761
1762 static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
1763 {
1764         int ret = 0;
1765         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1766         struct page *page = bvec->bv_page;
1767         struct extent_io_tree *tree = bio->bi_private;
1768         struct rb_node *node;
1769         struct extent_state *state;
1770         u64 start;
1771         u64 end;
1772
1773         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1774         end = start + bvec->bv_len - 1;
1775
1776         spin_lock_irq(&tree->lock);
1777         node = __etree_search(tree, start, NULL, NULL);
1778         BUG_ON(!node);
1779         state = rb_entry(node, struct extent_state, rb_node);
1780         while(state->end < end) {
1781                 node = rb_next(node);
1782                 state = rb_entry(node, struct extent_state, rb_node);
1783         }
1784         BUG_ON(state->end != end);
1785         spin_unlock_irq(&tree->lock);
1786
1787         bio->bi_private = state;
1788
1789         bio_get(bio);
1790
1791         if (tree->ops && tree->ops->submit_bio_hook)
1792                 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1793                                            mirror_num);
1794         else
1795                 submit_bio(rw, bio);
1796         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1797                 ret = -EOPNOTSUPP;
1798         bio_put(bio);
1799         return ret;
1800 }
1801
1802 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1803                               struct page *page, sector_t sector,
1804                               size_t size, unsigned long offset,
1805                               struct block_device *bdev,
1806                               struct bio **bio_ret,
1807                               unsigned long max_pages,
1808                               bio_end_io_t end_io_func,
1809                               int mirror_num)
1810 {
1811         int ret = 0;
1812         struct bio *bio;
1813         int nr;
1814
1815         if (bio_ret && *bio_ret) {
1816                 bio = *bio_ret;
1817                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1818                     (tree->ops && tree->ops->merge_bio_hook &&
1819                      tree->ops->merge_bio_hook(page, offset, size, bio)) ||
1820                     bio_add_page(bio, page, size, offset) < size) {
1821                         ret = submit_one_bio(rw, bio, mirror_num);
1822                         bio = NULL;
1823                 } else {
1824                         return 0;
1825                 }
1826         }
1827         nr = bio_get_nr_vecs(bdev);
1828         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1829         if (!bio) {
1830                 printk("failed to allocate bio nr %d\n", nr);
1831         }
1832
1833
1834         bio_add_page(bio, page, size, offset);
1835         bio->bi_end_io = end_io_func;
1836         bio->bi_private = tree;
1837
1838         if (bio_ret) {
1839                 *bio_ret = bio;
1840         } else {
1841                 ret = submit_one_bio(rw, bio, mirror_num);
1842         }
1843
1844         return ret;
1845 }
1846
1847 void set_page_extent_mapped(struct page *page)
1848 {
1849         if (!PagePrivate(page)) {
1850                 SetPagePrivate(page);
1851                 page_cache_get(page);
1852                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1853         }
1854 }
1855
1856 void set_page_extent_head(struct page *page, unsigned long len)
1857 {
1858         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1859 }
1860
1861 /*
1862  * basic readpage implementation.  Locked extent state structs are inserted
1863  * into the tree that are removed when the IO is done (by the end_io
1864  * handlers)
1865  */
1866 static int __extent_read_full_page(struct extent_io_tree *tree,
1867                                    struct page *page,
1868                                    get_extent_t *get_extent,
1869                                    struct bio **bio, int mirror_num)
1870 {
1871         struct inode *inode = page->mapping->host;
1872         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1873         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1874         u64 end;
1875         u64 cur = start;
1876         u64 extent_offset;
1877         u64 last_byte = i_size_read(inode);
1878         u64 block_start;
1879         u64 cur_end;
1880         sector_t sector;
1881         struct extent_map *em;
1882         struct block_device *bdev;
1883         int ret;
1884         int nr = 0;
1885         size_t page_offset = 0;
1886         size_t iosize;
1887         size_t blocksize = inode->i_sb->s_blocksize;
1888
1889         set_page_extent_mapped(page);
1890
1891         end = page_end;
1892         lock_extent(tree, start, end, GFP_NOFS);
1893
1894         while (cur <= end) {
1895                 if (cur >= last_byte) {
1896                         char *userpage;
1897                         iosize = PAGE_CACHE_SIZE - page_offset;
1898                         userpage = kmap_atomic(page, KM_USER0);
1899                         memset(userpage + page_offset, 0, iosize);
1900                         flush_dcache_page(page);
1901                         kunmap_atomic(userpage, KM_USER0);
1902                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1903                                             GFP_NOFS);
1904                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1905                         break;
1906                 }
1907                 em = get_extent(inode, page, page_offset, cur,
1908                                 end - cur + 1, 0);
1909                 if (IS_ERR(em) || !em) {
1910                         SetPageError(page);
1911                         unlock_extent(tree, cur, end, GFP_NOFS);
1912                         break;
1913                 }
1914                 extent_offset = cur - em->start;
1915                 if (extent_map_end(em) <= cur) {
1916 printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
1917                 }
1918                 BUG_ON(extent_map_end(em) <= cur);
1919                 if (end < cur) {
1920 printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1921                 }
1922                 BUG_ON(end < cur);
1923
1924                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1925                 cur_end = min(extent_map_end(em) - 1, end);
1926                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1927                 sector = (em->block_start + extent_offset) >> 9;
1928                 bdev = em->bdev;
1929                 block_start = em->block_start;
1930                 free_extent_map(em);
1931                 em = NULL;
1932
1933                 /* we've found a hole, just zero and go on */
1934                 if (block_start == EXTENT_MAP_HOLE) {
1935                         char *userpage;
1936                         userpage = kmap_atomic(page, KM_USER0);
1937                         memset(userpage + page_offset, 0, iosize);
1938                         flush_dcache_page(page);
1939                         kunmap_atomic(userpage, KM_USER0);
1940
1941                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1942                                             GFP_NOFS);
1943                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1944                         cur = cur + iosize;
1945                         page_offset += iosize;
1946                         continue;
1947                 }
1948                 /* the get_extent function already copied into the page */
1949                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1950                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1951                         cur = cur + iosize;
1952                         page_offset += iosize;
1953                         continue;
1954                 }
1955                 /* we have an inline extent but it didn't get marked up
1956                  * to date.  Error out
1957                  */
1958                 if (block_start == EXTENT_MAP_INLINE) {
1959                         SetPageError(page);
1960                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1961                         cur = cur + iosize;
1962                         page_offset += iosize;
1963                         continue;
1964                 }
1965
1966                 ret = 0;
1967                 if (tree->ops && tree->ops->readpage_io_hook) {
1968                         ret = tree->ops->readpage_io_hook(page, cur,
1969                                                           cur + iosize - 1);
1970                 }
1971                 if (!ret) {
1972                         unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1973                         nr -= page->index;
1974                         ret = submit_extent_page(READ, tree, page,
1975                                          sector, iosize, page_offset,
1976                                          bdev, bio, nr,
1977                                          end_bio_extent_readpage, mirror_num);
1978                 }
1979                 if (ret)
1980                         SetPageError(page);
1981                 cur = cur + iosize;
1982                 page_offset += iosize;
1983                 nr++;
1984         }
1985         if (!nr) {
1986                 if (!PageError(page))
1987                         SetPageUptodate(page);
1988                 unlock_page(page);
1989         }
1990         return 0;
1991 }
1992
1993 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1994                             get_extent_t *get_extent)
1995 {
1996         struct bio *bio = NULL;
1997         int ret;
1998
1999         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
2000         if (bio)
2001                 submit_one_bio(READ, bio, 0);
2002         return ret;
2003 }
2004 EXPORT_SYMBOL(extent_read_full_page);
2005
2006 /*
2007  * the writepage semantics are similar to regular writepage.  extent
2008  * records are inserted to lock ranges in the tree, and as dirty areas
2009  * are found, they are marked writeback.  Then the lock bits are removed
2010  * and the end_io handler clears the writeback ranges
2011  */
2012 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2013                               void *data)
2014 {
2015         struct inode *inode = page->mapping->host;
2016         struct extent_page_data *epd = data;
2017         struct extent_io_tree *tree = epd->tree;
2018         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2019         u64 delalloc_start;
2020         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2021         u64 end;
2022         u64 cur = start;
2023         u64 extent_offset;
2024         u64 last_byte = i_size_read(inode);
2025         u64 block_start;
2026         u64 iosize;
2027         u64 unlock_start;
2028         sector_t sector;
2029         struct extent_map *em;
2030         struct block_device *bdev;
2031         int ret;
2032         int nr = 0;
2033         size_t pg_offset = 0;
2034         size_t blocksize;
2035         loff_t i_size = i_size_read(inode);
2036         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2037         u64 nr_delalloc;
2038         u64 delalloc_end;
2039
2040         WARN_ON(!PageLocked(page));
2041         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2042         if (page->index > end_index ||
2043            (page->index == end_index && !pg_offset)) {
2044                 page->mapping->a_ops->invalidatepage(page, 0);
2045                 unlock_page(page);
2046                 return 0;
2047         }
2048
2049         if (page->index == end_index) {
2050                 char *userpage;
2051
2052                 userpage = kmap_atomic(page, KM_USER0);
2053                 memset(userpage + pg_offset, 0,
2054                        PAGE_CACHE_SIZE - pg_offset);
2055                 kunmap_atomic(userpage, KM_USER0);
2056                 flush_dcache_page(page);
2057         }
2058         pg_offset = 0;
2059
2060         set_page_extent_mapped(page);
2061
2062         delalloc_start = start;
2063         delalloc_end = 0;
2064         while(delalloc_end < page_end) {
2065                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
2066                                                        &delalloc_end,
2067                                                        128 * 1024 * 1024);
2068                 if (nr_delalloc == 0) {
2069                         delalloc_start = delalloc_end + 1;
2070                         continue;
2071                 }
2072                 tree->ops->fill_delalloc(inode, delalloc_start,
2073                                          delalloc_end);
2074                 clear_extent_bit(tree, delalloc_start,
2075                                  delalloc_end,
2076                                  EXTENT_LOCKED | EXTENT_DELALLOC,
2077                                  1, 0, GFP_NOFS);
2078                 delalloc_start = delalloc_end + 1;
2079         }
2080         lock_extent(tree, start, page_end, GFP_NOFS);
2081         unlock_start = start;
2082
2083         if (tree->ops && tree->ops->writepage_start_hook) {
2084                 ret = tree->ops->writepage_start_hook(page, start, page_end);
2085                 if (ret == -EAGAIN) {
2086                         unlock_extent(tree, start, page_end, GFP_NOFS);
2087                         redirty_page_for_writepage(wbc, page);
2088                         unlock_page(page);
2089                         return 0;
2090                 }
2091         }
2092
2093         end = page_end;
2094         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2095                 printk("found delalloc bits after lock_extent\n");
2096         }
2097
2098         if (last_byte <= start) {
2099                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2100                 unlock_extent(tree, start, page_end, GFP_NOFS);
2101                 if (tree->ops && tree->ops->writepage_end_io_hook)
2102                         tree->ops->writepage_end_io_hook(page, start,
2103                                                          page_end, NULL, 1);
2104                 unlock_start = page_end + 1;
2105                 goto done;
2106         }
2107
2108         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2109         blocksize = inode->i_sb->s_blocksize;
2110
2111         while (cur <= end) {
2112                 if (cur >= last_byte) {
2113                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2114                         unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2115                         if (tree->ops && tree->ops->writepage_end_io_hook)
2116                                 tree->ops->writepage_end_io_hook(page, cur,
2117                                                          page_end, NULL, 1);
2118                         unlock_start = page_end + 1;
2119                         break;
2120                 }
2121                 em = epd->get_extent(inode, page, pg_offset, cur,
2122                                      end - cur + 1, 1);
2123                 if (IS_ERR(em) || !em) {
2124                         SetPageError(page);
2125                         break;
2126                 }
2127
2128                 extent_offset = cur - em->start;
2129                 BUG_ON(extent_map_end(em) <= cur);
2130                 BUG_ON(end < cur);
2131                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2132                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2133                 sector = (em->block_start + extent_offset) >> 9;
2134                 bdev = em->bdev;
2135                 block_start = em->block_start;
2136                 free_extent_map(em);
2137                 em = NULL;
2138
2139                 if (block_start == EXTENT_MAP_HOLE ||
2140                     block_start == EXTENT_MAP_INLINE) {
2141                         clear_extent_dirty(tree, cur,
2142                                            cur + iosize - 1, GFP_NOFS);
2143
2144                         unlock_extent(tree, unlock_start, cur + iosize -1,
2145                                       GFP_NOFS);
2146
2147                         if (tree->ops && tree->ops->writepage_end_io_hook)
2148                                 tree->ops->writepage_end_io_hook(page, cur,
2149                                                          cur + iosize - 1,
2150                                                          NULL, 1);
2151                         cur = cur + iosize;
2152                         pg_offset += iosize;
2153                         unlock_start = cur;
2154                         continue;
2155                 }
2156
2157                 /* leave this out until we have a page_mkwrite call */
2158                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2159                                    EXTENT_DIRTY, 0)) {
2160                         cur = cur + iosize;
2161                         pg_offset += iosize;
2162                         continue;
2163                 }
2164                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2165                 if (tree->ops && tree->ops->writepage_io_hook) {
2166                         ret = tree->ops->writepage_io_hook(page, cur,
2167                                                 cur + iosize - 1);
2168                 } else {
2169                         ret = 0;
2170                 }
2171                 if (ret) {
2172                         SetPageError(page);
2173                 } else {
2174                         unsigned long max_nr = end_index + 1;
2175
2176                         set_range_writeback(tree, cur, cur + iosize - 1);
2177                         if (!PageWriteback(page)) {
2178                                 printk("warning page %lu not writeback, "
2179                                        "cur %llu end %llu\n", page->index,
2180                                        (unsigned long long)cur,
2181                                        (unsigned long long)end);
2182                         }
2183
2184                         ret = submit_extent_page(WRITE, tree, page, sector,
2185                                                  iosize, pg_offset, bdev,
2186                                                  &epd->bio, max_nr,
2187                                                  end_bio_extent_writepage, 0);
2188                         if (ret)
2189                                 SetPageError(page);
2190                 }
2191                 cur = cur + iosize;
2192                 pg_offset += iosize;
2193                 nr++;
2194         }
2195 done:
2196         if (nr == 0) {
2197                 /* make sure the mapping tag for page dirty gets cleared */
2198                 set_page_writeback(page);
2199                 end_page_writeback(page);
2200         }
2201         if (unlock_start <= page_end)
2202                 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2203         unlock_page(page);
2204         return 0;
2205 }
2206
2207 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
2208 /* Taken directly from 2.6.23 for 2.6.18 back port */
2209 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2210                                 void *data);
2211
2212 /**
2213  * write_cache_pages - walk the list of dirty pages of the given address space
2214  * and write all of them.
2215  * @mapping: address space structure to write
2216  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2217  * @writepage: function called for each page
2218  * @data: data passed to writepage function
2219  *
2220  * If a page is already under I/O, write_cache_pages() skips it, even
2221  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2222  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2223  * and msync() need to guarantee that all the data which was dirty at the time
2224  * the call was made get new I/O started against them.  If wbc->sync_mode is
2225  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2226  * existing IO to complete.
2227  */
2228 static int write_cache_pages(struct address_space *mapping,
2229                       struct writeback_control *wbc, writepage_t writepage,
2230                       void *data)
2231 {
2232         struct backing_dev_info *bdi = mapping->backing_dev_info;
2233         int ret = 0;
2234         int done = 0;
2235         struct pagevec pvec;
2236         int nr_pages;
2237         pgoff_t index;
2238         pgoff_t end;            /* Inclusive */
2239         int scanned = 0;
2240         int range_whole = 0;
2241
2242         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2243                 wbc->encountered_congestion = 1;
2244                 return 0;
2245         }
2246
2247         pagevec_init(&pvec, 0);
2248         if (wbc->range_cyclic) {
2249                 index = mapping->writeback_index; /* Start from prev offset */
2250                 end = -1;
2251         } else {
2252                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2253                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2254                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2255                         range_whole = 1;
2256                 scanned = 1;
2257         }
2258 retry:
2259         while (!done && (index <= end) &&
2260                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2261                                               PAGECACHE_TAG_DIRTY,
2262                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2263                 unsigned i;
2264
2265                 scanned = 1;
2266                 for (i = 0; i < nr_pages; i++) {
2267                         struct page *page = pvec.pages[i];
2268
2269                         /*
2270                          * At this point we hold neither mapping->tree_lock nor
2271                          * lock on the page itself: the page may be truncated or
2272                          * invalidated (changing page->mapping to NULL), or even
2273                          * swizzled back from swapper_space to tmpfs file
2274                          * mapping
2275                          */
2276                         lock_page(page);
2277
2278                         if (unlikely(page->mapping != mapping)) {
2279                                 unlock_page(page);
2280                                 continue;
2281                         }
2282
2283                         if (!wbc->range_cyclic && page->index > end) {
2284                                 done = 1;
2285                                 unlock_page(page);
2286                                 continue;
2287                         }
2288
2289                         if (wbc->sync_mode != WB_SYNC_NONE)
2290                                 wait_on_page_writeback(page);
2291
2292                         if (PageWriteback(page) ||
2293                             !clear_page_dirty_for_io(page)) {
2294                                 unlock_page(page);
2295                                 continue;
2296                         }
2297
2298                         ret = (*writepage)(page, wbc, data);
2299
2300                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2301                                 unlock_page(page);
2302                                 ret = 0;
2303                         }
2304                         if (ret || (--(wbc->nr_to_write) <= 0))
2305                                 done = 1;
2306                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2307                                 wbc->encountered_congestion = 1;
2308                                 done = 1;
2309                         }
2310                 }
2311                 pagevec_release(&pvec);
2312                 cond_resched();
2313         }
2314         if (!scanned && !done) {
2315                 /*
2316                  * We hit the last page and there is more work to be done: wrap
2317                  * back to the start of the file
2318                  */
2319                 scanned = 1;
2320                 index = 0;
2321                 goto retry;
2322         }
2323         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2324                 mapping->writeback_index = index;
2325         return ret;
2326 }
2327 #endif
2328
2329 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2330                           get_extent_t *get_extent,
2331                           struct writeback_control *wbc)
2332 {
2333         int ret;
2334         struct address_space *mapping = page->mapping;
2335         struct extent_page_data epd = {
2336                 .bio = NULL,
2337                 .tree = tree,
2338                 .get_extent = get_extent,
2339         };
2340         struct writeback_control wbc_writepages = {
2341                 .bdi            = wbc->bdi,
2342                 .sync_mode      = WB_SYNC_NONE,
2343                 .older_than_this = NULL,
2344                 .nr_to_write    = 64,
2345                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2346                 .range_end      = (loff_t)-1,
2347         };
2348
2349
2350         ret = __extent_writepage(page, wbc, &epd);
2351
2352         write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2353         if (epd.bio) {
2354                 submit_one_bio(WRITE, epd.bio, 0);
2355         }
2356         return ret;
2357 }
2358 EXPORT_SYMBOL(extent_write_full_page);
2359
2360
2361 int extent_writepages(struct extent_io_tree *tree,
2362                       struct address_space *mapping,
2363                       get_extent_t *get_extent,
2364                       struct writeback_control *wbc)
2365 {
2366         int ret = 0;
2367         struct extent_page_data epd = {
2368                 .bio = NULL,
2369                 .tree = tree,
2370                 .get_extent = get_extent,
2371         };
2372
2373         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2374         if (epd.bio) {
2375                 submit_one_bio(WRITE, epd.bio, 0);
2376         }
2377         return ret;
2378 }
2379 EXPORT_SYMBOL(extent_writepages);
2380
2381 int extent_readpages(struct extent_io_tree *tree,
2382                      struct address_space *mapping,
2383                      struct list_head *pages, unsigned nr_pages,
2384                      get_extent_t get_extent)
2385 {
2386         struct bio *bio = NULL;
2387         unsigned page_idx;
2388         struct pagevec pvec;
2389
2390         pagevec_init(&pvec, 0);
2391         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2392                 struct page *page = list_entry(pages->prev, struct page, lru);
2393
2394                 prefetchw(&page->flags);
2395                 list_del(&page->lru);
2396                 /*
2397                  * what we want to do here is call add_to_page_cache_lru,
2398                  * but that isn't exported, so we reproduce it here
2399                  */
2400                 if (!add_to_page_cache(page, mapping,
2401                                         page->index, GFP_KERNEL)) {
2402
2403                         /* open coding of lru_cache_add, also not exported */
2404                         page_cache_get(page);
2405                         if (!pagevec_add(&pvec, page))
2406                                 __pagevec_lru_add(&pvec);
2407                         __extent_read_full_page(tree, page, get_extent,
2408                                                 &bio, 0);
2409                 }
2410                 page_cache_release(page);
2411         }
2412         if (pagevec_count(&pvec))
2413                 __pagevec_lru_add(&pvec);
2414         BUG_ON(!list_empty(pages));
2415         if (bio)
2416                 submit_one_bio(READ, bio, 0);
2417         return 0;
2418 }
2419 EXPORT_SYMBOL(extent_readpages);
2420
2421 /*
2422  * basic invalidatepage code, this waits on any locked or writeback
2423  * ranges corresponding to the page, and then deletes any extent state
2424  * records from the tree
2425  */
2426 int extent_invalidatepage(struct extent_io_tree *tree,
2427                           struct page *page, unsigned long offset)
2428 {
2429         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2430         u64 end = start + PAGE_CACHE_SIZE - 1;
2431         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2432
2433         start += (offset + blocksize -1) & ~(blocksize - 1);
2434         if (start > end)
2435                 return 0;
2436
2437         lock_extent(tree, start, end, GFP_NOFS);
2438         wait_on_extent_writeback(tree, start, end);
2439         clear_extent_bit(tree, start, end,
2440                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2441                          1, 1, GFP_NOFS);
2442         return 0;
2443 }
2444 EXPORT_SYMBOL(extent_invalidatepage);
2445
2446 /*
2447  * simple commit_write call, set_range_dirty is used to mark both
2448  * the pages and the extent records as dirty
2449  */
2450 int extent_commit_write(struct extent_io_tree *tree,
2451                         struct inode *inode, struct page *page,
2452                         unsigned from, unsigned to)
2453 {
2454         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2455
2456         set_page_extent_mapped(page);
2457         set_page_dirty(page);
2458
2459         if (pos > inode->i_size) {
2460                 i_size_write(inode, pos);
2461                 mark_inode_dirty(inode);
2462         }
2463         return 0;
2464 }
2465 EXPORT_SYMBOL(extent_commit_write);
2466
2467 int extent_prepare_write(struct extent_io_tree *tree,
2468                          struct inode *inode, struct page *page,
2469                          unsigned from, unsigned to, get_extent_t *get_extent)
2470 {
2471         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2472         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2473         u64 block_start;
2474         u64 orig_block_start;
2475         u64 block_end;
2476         u64 cur_end;
2477         struct extent_map *em;
2478         unsigned blocksize = 1 << inode->i_blkbits;
2479         size_t page_offset = 0;
2480         size_t block_off_start;
2481         size_t block_off_end;
2482         int err = 0;
2483         int iocount = 0;
2484         int ret = 0;
2485         int isnew;
2486
2487         set_page_extent_mapped(page);
2488
2489         block_start = (page_start + from) & ~((u64)blocksize - 1);
2490         block_end = (page_start + to - 1) | (blocksize - 1);
2491         orig_block_start = block_start;
2492
2493         lock_extent(tree, page_start, page_end, GFP_NOFS);
2494         while(block_start <= block_end) {
2495                 em = get_extent(inode, page, page_offset, block_start,
2496                                 block_end - block_start + 1, 1);
2497                 if (IS_ERR(em) || !em) {
2498                         goto err;
2499                 }
2500                 cur_end = min(block_end, extent_map_end(em) - 1);
2501                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2502                 block_off_end = block_off_start + blocksize;
2503                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2504
2505                 if (!PageUptodate(page) && isnew &&
2506                     (block_off_end > to || block_off_start < from)) {
2507                         void *kaddr;
2508
2509                         kaddr = kmap_atomic(page, KM_USER0);
2510                         if (block_off_end > to)
2511                                 memset(kaddr + to, 0, block_off_end - to);
2512                         if (block_off_start < from)
2513                                 memset(kaddr + block_off_start, 0,
2514                                        from - block_off_start);
2515                         flush_dcache_page(page);
2516                         kunmap_atomic(kaddr, KM_USER0);
2517                 }
2518                 if ((em->block_start != EXTENT_MAP_HOLE &&
2519                      em->block_start != EXTENT_MAP_INLINE) &&
2520                     !isnew && !PageUptodate(page) &&
2521                     (block_off_end > to || block_off_start < from) &&
2522                     !test_range_bit(tree, block_start, cur_end,
2523                                     EXTENT_UPTODATE, 1)) {
2524                         u64 sector;
2525                         u64 extent_offset = block_start - em->start;
2526                         size_t iosize;
2527                         sector = (em->block_start + extent_offset) >> 9;
2528                         iosize = (cur_end - block_start + blocksize) &
2529                                 ~((u64)blocksize - 1);
2530                         /*
2531                          * we've already got the extent locked, but we
2532                          * need to split the state such that our end_bio
2533                          * handler can clear the lock.
2534                          */
2535                         set_extent_bit(tree, block_start,
2536                                        block_start + iosize - 1,
2537                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2538                         ret = submit_extent_page(READ, tree, page,
2539                                          sector, iosize, page_offset, em->bdev,
2540                                          NULL, 1,
2541                                          end_bio_extent_preparewrite, 0);
2542                         iocount++;
2543                         block_start = block_start + iosize;
2544                 } else {
2545                         set_extent_uptodate(tree, block_start, cur_end,
2546                                             GFP_NOFS);
2547                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2548                         block_start = cur_end + 1;
2549                 }
2550                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2551                 free_extent_map(em);
2552         }
2553         if (iocount) {
2554                 wait_extent_bit(tree, orig_block_start,
2555                                 block_end, EXTENT_LOCKED);
2556         }
2557         check_page_uptodate(tree, page);
2558 err:
2559         /* FIXME, zero out newly allocated blocks on error */
2560         return err;
2561 }
2562 EXPORT_SYMBOL(extent_prepare_write);
2563
2564 /*
2565  * a helper for releasepage, this tests for areas of the page that
2566  * are locked or under IO and drops the related state bits if it is safe
2567  * to drop the page.
2568  */
2569 int try_release_extent_state(struct extent_map_tree *map,
2570                              struct extent_io_tree *tree, struct page *page,
2571                              gfp_t mask)
2572 {
2573         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2574         u64 end = start + PAGE_CACHE_SIZE - 1;
2575         int ret = 1;
2576
2577         if (test_range_bit(tree, start, end,
2578                            EXTENT_IOBITS | EXTENT_ORDERED, 0))
2579                 ret = 0;
2580         else {
2581                 if ((mask & GFP_NOFS) == GFP_NOFS)
2582                         mask = GFP_NOFS;
2583                 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2584                                  1, 1, mask);
2585         }
2586         return ret;
2587 }
2588 EXPORT_SYMBOL(try_release_extent_state);
2589
2590 /*
2591  * a helper for releasepage.  As long as there are no locked extents
2592  * in the range corresponding to the page, both state records and extent
2593  * map records are removed
2594  */
2595 int try_release_extent_mapping(struct extent_map_tree *map,
2596                                struct extent_io_tree *tree, struct page *page,
2597                                gfp_t mask)
2598 {
2599         struct extent_map *em;
2600         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2601         u64 end = start + PAGE_CACHE_SIZE - 1;
2602
2603         if ((mask & __GFP_WAIT) &&
2604             page->mapping->host->i_size > 16 * 1024 * 1024) {
2605                 u64 len;
2606                 while (start <= end) {
2607                         len = end - start + 1;
2608                         spin_lock(&map->lock);
2609                         em = lookup_extent_mapping(map, start, len);
2610                         if (!em || IS_ERR(em)) {
2611                                 spin_unlock(&map->lock);
2612                                 break;
2613                         }
2614                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2615                             em->start != start) {
2616                                 spin_unlock(&map->lock);
2617                                 free_extent_map(em);
2618                                 break;
2619                         }
2620                         if (!test_range_bit(tree, em->start,
2621                                             extent_map_end(em) - 1,
2622                                             EXTENT_LOCKED, 0)) {
2623                                 remove_extent_mapping(map, em);
2624                                 /* once for the rb tree */
2625                                 free_extent_map(em);
2626                         }
2627                         start = extent_map_end(em);
2628                         spin_unlock(&map->lock);
2629
2630                         /* once for us */
2631                         free_extent_map(em);
2632                 }
2633         }
2634         return try_release_extent_state(map, tree, page, mask);
2635 }
2636 EXPORT_SYMBOL(try_release_extent_mapping);
2637
2638 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2639                 get_extent_t *get_extent)
2640 {
2641         struct inode *inode = mapping->host;
2642         u64 start = iblock << inode->i_blkbits;
2643         sector_t sector = 0;
2644         struct extent_map *em;
2645
2646         em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2647         if (!em || IS_ERR(em))
2648                 return 0;
2649
2650         if (em->block_start == EXTENT_MAP_INLINE ||
2651             em->block_start == EXTENT_MAP_HOLE)
2652                 goto out;
2653
2654         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2655 out:
2656         free_extent_map(em);
2657         return sector;
2658 }
2659
2660 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2661                                               unsigned long i)
2662 {
2663         struct page *p;
2664         struct address_space *mapping;
2665
2666         if (i == 0)
2667                 return eb->first_page;
2668         i += eb->start >> PAGE_CACHE_SHIFT;
2669         mapping = eb->first_page->mapping;
2670         read_lock_irq(&mapping->tree_lock);
2671         p = radix_tree_lookup(&mapping->page_tree, i);
2672         read_unlock_irq(&mapping->tree_lock);
2673         return p;
2674 }
2675
2676 static inline unsigned long num_extent_pages(u64 start, u64 len)
2677 {
2678         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2679                 (start >> PAGE_CACHE_SHIFT);
2680 }
2681
2682 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2683                                                    u64 start,
2684                                                    unsigned long len,
2685                                                    gfp_t mask)
2686 {
2687         struct extent_buffer *eb = NULL;
2688         unsigned long flags;
2689
2690         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2691         eb->start = start;
2692         eb->len = len;
2693         spin_lock_irqsave(&leak_lock, flags);
2694         list_add(&eb->leak_list, &buffers);
2695         spin_unlock_irqrestore(&leak_lock, flags);
2696         atomic_set(&eb->refs, 1);
2697
2698         return eb;
2699 }
2700
2701 static void __free_extent_buffer(struct extent_buffer *eb)
2702 {
2703         unsigned long flags;
2704         spin_lock_irqsave(&leak_lock, flags);
2705         list_del(&eb->leak_list);
2706         spin_unlock_irqrestore(&leak_lock, flags);
2707         kmem_cache_free(extent_buffer_cache, eb);
2708 }
2709
2710 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2711                                           u64 start, unsigned long len,
2712                                           struct page *page0,
2713                                           gfp_t mask)
2714 {
2715         unsigned long num_pages = num_extent_pages(start, len);
2716         unsigned long i;
2717         unsigned long index = start >> PAGE_CACHE_SHIFT;
2718         struct extent_buffer *eb;
2719         struct extent_buffer *exists = NULL;
2720         struct page *p;
2721         struct address_space *mapping = tree->mapping;
2722         int uptodate = 1;
2723
2724         spin_lock(&tree->buffer_lock);
2725         eb = buffer_search(tree, start);
2726         if (eb) {
2727                 atomic_inc(&eb->refs);
2728                 spin_unlock(&tree->buffer_lock);
2729                 return eb;
2730         }
2731         spin_unlock(&tree->buffer_lock);
2732
2733         eb = __alloc_extent_buffer(tree, start, len, mask);
2734         if (!eb)
2735                 return NULL;
2736
2737         if (page0) {
2738                 eb->first_page = page0;
2739                 i = 1;
2740                 index++;
2741                 page_cache_get(page0);
2742                 mark_page_accessed(page0);
2743                 set_page_extent_mapped(page0);
2744                 set_page_extent_head(page0, len);
2745                 uptodate = PageUptodate(page0);
2746         } else {
2747                 i = 0;
2748         }
2749         for (; i < num_pages; i++, index++) {
2750                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2751                 if (!p) {
2752                         WARN_ON(1);
2753                         goto free_eb;
2754                 }
2755                 set_page_extent_mapped(p);
2756                 mark_page_accessed(p);
2757                 if (i == 0) {
2758                         eb->first_page = p;
2759                         set_page_extent_head(p, len);
2760                 } else {
2761                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2762                 }
2763                 if (!PageUptodate(p))
2764                         uptodate = 0;
2765                 unlock_page(p);
2766         }
2767         if (uptodate)
2768                 eb->flags |= EXTENT_UPTODATE;
2769         eb->flags |= EXTENT_BUFFER_FILLED;
2770
2771         spin_lock(&tree->buffer_lock);
2772         exists = buffer_tree_insert(tree, start, &eb->rb_node);
2773         if (exists) {
2774                 /* add one reference for the caller */
2775                 atomic_inc(&exists->refs);
2776                 spin_unlock(&tree->buffer_lock);
2777                 goto free_eb;
2778         }
2779         spin_unlock(&tree->buffer_lock);
2780
2781         /* add one reference for the tree */
2782         atomic_inc(&eb->refs);
2783         return eb;
2784
2785 free_eb:
2786         if (!atomic_dec_and_test(&eb->refs))
2787                 return exists;
2788         for (index = 1; index < i; index++)
2789                 page_cache_release(extent_buffer_page(eb, index));
2790         page_cache_release(extent_buffer_page(eb, 0));
2791         __free_extent_buffer(eb);
2792         return exists;
2793 }
2794 EXPORT_SYMBOL(alloc_extent_buffer);
2795
2796 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2797                                          u64 start, unsigned long len,
2798                                           gfp_t mask)
2799 {
2800         struct extent_buffer *eb;
2801
2802         spin_lock(&tree->buffer_lock);
2803         eb = buffer_search(tree, start);
2804         if (eb)
2805                 atomic_inc(&eb->refs);
2806         spin_unlock(&tree->buffer_lock);
2807
2808         return eb;
2809 }
2810 EXPORT_SYMBOL(find_extent_buffer);
2811
2812 void free_extent_buffer(struct extent_buffer *eb)
2813 {
2814         if (!eb)
2815                 return;
2816
2817         if (!atomic_dec_and_test(&eb->refs))
2818                 return;
2819
2820         WARN_ON(1);
2821 }
2822 EXPORT_SYMBOL(free_extent_buffer);
2823
2824 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2825                               struct extent_buffer *eb)
2826 {
2827         int set;
2828         unsigned long i;
2829         unsigned long num_pages;
2830         struct page *page;
2831
2832         u64 start = eb->start;
2833         u64 end = start + eb->len - 1;
2834
2835         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2836         num_pages = num_extent_pages(eb->start, eb->len);
2837
2838         for (i = 0; i < num_pages; i++) {
2839                 page = extent_buffer_page(eb, i);
2840                 if (i == 0)
2841                         set_page_extent_head(page, eb->len);
2842                 else
2843                         set_page_private(page, EXTENT_PAGE_PRIVATE);
2844
2845                 /*
2846                  * if we're on the last page or the first page and the
2847                  * block isn't aligned on a page boundary, do extra checks
2848                  * to make sure we don't clean page that is partially dirty
2849                  */
2850                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2851                     ((i == num_pages - 1) &&
2852                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2853                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2854                         end  = start + PAGE_CACHE_SIZE - 1;
2855                         if (test_range_bit(tree, start, end,
2856                                            EXTENT_DIRTY, 0)) {
2857                                 continue;
2858                         }
2859                 }
2860                 clear_page_dirty_for_io(page);
2861                 read_lock_irq(&page->mapping->tree_lock);
2862                 if (!PageDirty(page)) {
2863                         radix_tree_tag_clear(&page->mapping->page_tree,
2864                                                 page_index(page),
2865                                                 PAGECACHE_TAG_DIRTY);
2866                 }
2867                 read_unlock_irq(&page->mapping->tree_lock);
2868         }
2869         return 0;
2870 }
2871 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2872
2873 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2874                                     struct extent_buffer *eb)
2875 {
2876         return wait_on_extent_writeback(tree, eb->start,
2877                                         eb->start + eb->len - 1);
2878 }
2879 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2880
2881 int set_extent_buffer_dirty(struct extent_io_tree *tree,
2882                              struct extent_buffer *eb)
2883 {
2884         unsigned long i;
2885         unsigned long num_pages;
2886
2887         num_pages = num_extent_pages(eb->start, eb->len);
2888         for (i = 0; i < num_pages; i++) {
2889                 struct page *page = extent_buffer_page(eb, i);
2890                 /* writepage may need to do something special for the
2891                  * first page, we have to make sure page->private is
2892                  * properly set.  releasepage may drop page->private
2893                  * on us if the page isn't already dirty.
2894                  */
2895                 if (i == 0) {
2896                         set_page_extent_head(page, eb->len);
2897                 } else if (PagePrivate(page) &&
2898                            page->private != EXTENT_PAGE_PRIVATE) {
2899                         set_page_extent_mapped(page);
2900                 }
2901                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2902         }
2903         return set_extent_dirty(tree, eb->start,
2904                                 eb->start + eb->len - 1, GFP_NOFS);
2905 }
2906 EXPORT_SYMBOL(set_extent_buffer_dirty);
2907
2908 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2909                                 struct extent_buffer *eb)
2910 {
2911         unsigned long i;
2912         struct page *page;
2913         unsigned long num_pages;
2914
2915         num_pages = num_extent_pages(eb->start, eb->len);
2916         eb->flags &= ~EXTENT_UPTODATE;
2917
2918         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2919                               GFP_NOFS);
2920         for (i = 0; i < num_pages; i++) {
2921                 page = extent_buffer_page(eb, i);
2922                 ClearPageUptodate(page);
2923         }
2924         return 0;
2925 }
2926
2927 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2928                                 struct extent_buffer *eb)
2929 {
2930         unsigned long i;
2931         struct page *page;
2932         unsigned long num_pages;
2933
2934         num_pages = num_extent_pages(eb->start, eb->len);
2935
2936         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2937                             GFP_NOFS);
2938         for (i = 0; i < num_pages; i++) {
2939                 page = extent_buffer_page(eb, i);
2940                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2941                     ((i == num_pages - 1) &&
2942                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2943                         check_page_uptodate(tree, page);
2944                         continue;
2945                 }
2946                 SetPageUptodate(page);
2947         }
2948         return 0;
2949 }
2950 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2951
2952 int extent_range_uptodate(struct extent_io_tree *tree,
2953                           u64 start, u64 end)
2954 {
2955         struct page *page;
2956         int ret;
2957         int pg_uptodate = 1;
2958         int uptodate;
2959         unsigned long index;
2960
2961         ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
2962         if (ret)
2963                 return 1;
2964         while(start <= end) {
2965                 index = start >> PAGE_CACHE_SHIFT;
2966                 page = find_get_page(tree->mapping, index);
2967                 uptodate = PageUptodate(page);
2968                 page_cache_release(page);
2969                 if (!uptodate) {
2970                         pg_uptodate = 0;
2971                         break;
2972                 }
2973                 start += PAGE_CACHE_SIZE;
2974         }
2975         return pg_uptodate;
2976 }
2977
2978 int extent_buffer_uptodate(struct extent_io_tree *tree,
2979                            struct extent_buffer *eb)
2980 {
2981         int ret = 0;
2982         unsigned long num_pages;
2983         unsigned long i;
2984         struct page *page;
2985         int pg_uptodate = 1;
2986
2987         if (eb->flags & EXTENT_UPTODATE)
2988                 return 1;
2989
2990         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2991                            EXTENT_UPTODATE, 1);
2992         if (ret)
2993                 return ret;
2994
2995         num_pages = num_extent_pages(eb->start, eb->len);
2996         for (i = 0; i < num_pages; i++) {
2997                 page = extent_buffer_page(eb, i);
2998                 if (!PageUptodate(page)) {
2999                         pg_uptodate = 0;
3000                         break;
3001                 }
3002         }
3003         return pg_uptodate;
3004 }
3005 EXPORT_SYMBOL(extent_buffer_uptodate);
3006
3007 int read_extent_buffer_pages(struct extent_io_tree *tree,
3008                              struct extent_buffer *eb,
3009                              u64 start, int wait,
3010                              get_extent_t *get_extent, int mirror_num)
3011 {
3012         unsigned long i;
3013         unsigned long start_i;
3014         struct page *page;
3015         int err;
3016         int ret = 0;
3017         int locked_pages = 0;
3018         int all_uptodate = 1;
3019         int inc_all_pages = 0;
3020         unsigned long num_pages;
3021         struct bio *bio = NULL;
3022
3023         if (eb->flags & EXTENT_UPTODATE)
3024                 return 0;
3025
3026         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3027                            EXTENT_UPTODATE, 1)) {
3028                 return 0;
3029         }
3030
3031         if (start) {
3032                 WARN_ON(start < eb->start);
3033                 start_i = (start >> PAGE_CACHE_SHIFT) -
3034                         (eb->start >> PAGE_CACHE_SHIFT);
3035         } else {
3036                 start_i = 0;
3037         }
3038
3039         num_pages = num_extent_pages(eb->start, eb->len);
3040         for (i = start_i; i < num_pages; i++) {
3041                 page = extent_buffer_page(eb, i);
3042                 if (!wait) {
3043                         if (TestSetPageLocked(page))
3044                                 goto unlock_exit;
3045                 } else {
3046                         lock_page(page);
3047                 }
3048                 locked_pages++;
3049                 if (!PageUptodate(page)) {
3050                         all_uptodate = 0;
3051                 }
3052         }
3053         if (all_uptodate) {
3054                 if (start_i == 0)
3055                         eb->flags |= EXTENT_UPTODATE;
3056                 goto unlock_exit;
3057         }
3058
3059         for (i = start_i; i < num_pages; i++) {
3060                 page = extent_buffer_page(eb, i);
3061                 if (inc_all_pages)
3062                         page_cache_get(page);
3063                 if (!PageUptodate(page)) {
3064                         if (start_i == 0)
3065                                 inc_all_pages = 1;
3066                         ClearPageError(page);
3067                         err = __extent_read_full_page(tree, page,
3068                                                       get_extent, &bio,
3069                                                       mirror_num);
3070                         if (err) {
3071                                 ret = err;
3072                         }
3073                 } else {
3074                         unlock_page(page);
3075                 }
3076         }
3077
3078         if (bio)
3079                 submit_one_bio(READ, bio, mirror_num);
3080
3081         if (ret || !wait) {
3082                 return ret;
3083         }
3084         for (i = start_i; i < num_pages; i++) {
3085                 page = extent_buffer_page(eb, i);
3086                 wait_on_page_locked(page);
3087                 if (!PageUptodate(page)) {
3088                         ret = -EIO;
3089                 }
3090         }
3091         if (!ret)
3092                 eb->flags |= EXTENT_UPTODATE;
3093         return ret;
3094
3095 unlock_exit:
3096         i = start_i;
3097         while(locked_pages > 0) {
3098                 page = extent_buffer_page(eb, i);
3099                 i++;
3100                 unlock_page(page);
3101                 locked_pages--;
3102         }
3103         return ret;
3104 }
3105 EXPORT_SYMBOL(read_extent_buffer_pages);
3106
3107 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3108                         unsigned long start,
3109                         unsigned long len)
3110 {
3111         size_t cur;
3112         size_t offset;
3113         struct page *page;
3114         char *kaddr;
3115         char *dst = (char *)dstv;
3116         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3117         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3118
3119         WARN_ON(start > eb->len);
3120         WARN_ON(start + len > eb->start + eb->len);
3121
3122         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3123
3124         while(len > 0) {
3125                 page = extent_buffer_page(eb, i);
3126
3127                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3128                 kaddr = kmap_atomic(page, KM_USER1);
3129                 memcpy(dst, kaddr + offset, cur);
3130                 kunmap_atomic(kaddr, KM_USER1);
3131
3132                 dst += cur;
3133                 len -= cur;
3134                 offset = 0;
3135                 i++;
3136         }
3137 }
3138 EXPORT_SYMBOL(read_extent_buffer);
3139
3140 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3141                                unsigned long min_len, char **token, char **map,
3142                                unsigned long *map_start,
3143                                unsigned long *map_len, int km)
3144 {
3145         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3146         char *kaddr;
3147         struct page *p;
3148         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3149         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3150         unsigned long end_i = (start_offset + start + min_len - 1) >>
3151                 PAGE_CACHE_SHIFT;
3152
3153         if (i != end_i)
3154                 return -EINVAL;
3155
3156         if (i == 0) {
3157                 offset = start_offset;
3158                 *map_start = 0;
3159         } else {
3160                 offset = 0;
3161                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3162         }
3163         if (start + min_len > eb->len) {
3164 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
3165                 WARN_ON(1);
3166         }
3167
3168         p = extent_buffer_page(eb, i);
3169         kaddr = kmap_atomic(p, km);
3170         *token = kaddr;
3171         *map = kaddr + offset;
3172         *map_len = PAGE_CACHE_SIZE - offset;
3173         return 0;
3174 }
3175 EXPORT_SYMBOL(map_private_extent_buffer);
3176
3177 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3178                       unsigned long min_len,
3179                       char **token, char **map,
3180                       unsigned long *map_start,
3181                       unsigned long *map_len, int km)
3182 {
3183         int err;
3184         int save = 0;
3185         if (eb->map_token) {
3186                 unmap_extent_buffer(eb, eb->map_token, km);
3187                 eb->map_token = NULL;
3188                 save = 1;
3189         }
3190         err = map_private_extent_buffer(eb, start, min_len, token, map,
3191                                        map_start, map_len, km);
3192         if (!err && save) {
3193                 eb->map_token = *token;
3194                 eb->kaddr = *map;
3195                 eb->map_start = *map_start;
3196                 eb->map_len = *map_len;
3197         }
3198         return err;
3199 }
3200 EXPORT_SYMBOL(map_extent_buffer);
3201
3202 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3203 {
3204         kunmap_atomic(token, km);
3205 }
3206 EXPORT_SYMBOL(unmap_extent_buffer);
3207
3208 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3209                           unsigned long start,
3210                           unsigned long len)
3211 {
3212         size_t cur;
3213         size_t offset;
3214         struct page *page;
3215         char *kaddr;
3216         char *ptr = (char *)ptrv;
3217         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3218         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3219         int ret = 0;
3220
3221         WARN_ON(start > eb->len);
3222         WARN_ON(start + len > eb->start + eb->len);
3223
3224         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3225
3226         while(len > 0) {
3227                 page = extent_buffer_page(eb, i);
3228
3229                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3230
3231                 kaddr = kmap_atomic(page, KM_USER0);
3232                 ret = memcmp(ptr, kaddr + offset, cur);
3233                 kunmap_atomic(kaddr, KM_USER0);
3234                 if (ret)
3235                         break;
3236
3237                 ptr += cur;
3238                 len -= cur;
3239                 offset = 0;
3240                 i++;
3241         }
3242         return ret;
3243 }
3244 EXPORT_SYMBOL(memcmp_extent_buffer);
3245
3246 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3247                          unsigned long start, unsigned long len)
3248 {
3249         size_t cur;
3250         size_t offset;
3251         struct page *page;
3252         char *kaddr;
3253         char *src = (char *)srcv;
3254         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3255         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3256
3257         WARN_ON(start > eb->len);
3258         WARN_ON(start + len > eb->start + eb->len);
3259
3260         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3261
3262         while(len > 0) {
3263                 page = extent_buffer_page(eb, i);
3264                 WARN_ON(!PageUptodate(page));
3265
3266                 cur = min(len, PAGE_CACHE_SIZE - offset);
3267                 kaddr = kmap_atomic(page, KM_USER1);
3268                 memcpy(kaddr + offset, src, cur);
3269                 kunmap_atomic(kaddr, KM_USER1);
3270
3271                 src += cur;
3272                 len -= cur;
3273                 offset = 0;
3274                 i++;
3275         }
3276 }
3277 EXPORT_SYMBOL(write_extent_buffer);
3278
3279 void memset_extent_buffer(struct extent_buffer *eb, char c,
3280                           unsigned long start, unsigned long len)
3281 {
3282         size_t cur;
3283         size_t offset;
3284         struct page *page;
3285         char *kaddr;
3286         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3287         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3288
3289         WARN_ON(start > eb->len);
3290         WARN_ON(start + len > eb->start + eb->len);
3291
3292         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3293
3294         while(len > 0) {
3295                 page = extent_buffer_page(eb, i);
3296                 WARN_ON(!PageUptodate(page));
3297
3298                 cur = min(len, PAGE_CACHE_SIZE - offset);
3299                 kaddr = kmap_atomic(page, KM_USER0);
3300                 memset(kaddr + offset, c, cur);
3301                 kunmap_atomic(kaddr, KM_USER0);
3302
3303                 len -= cur;
3304                 offset = 0;
3305                 i++;
3306         }
3307 }
3308 EXPORT_SYMBOL(memset_extent_buffer);
3309
3310 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3311                         unsigned long dst_offset, unsigned long src_offset,
3312                         unsigned long len)
3313 {
3314         u64 dst_len = dst->len;
3315         size_t cur;
3316         size_t offset;
3317         struct page *page;
3318         char *kaddr;
3319         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3320         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3321
3322         WARN_ON(src->len != dst_len);
3323
3324         offset = (start_offset + dst_offset) &
3325                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3326
3327         while(len > 0) {
3328                 page = extent_buffer_page(dst, i);
3329                 WARN_ON(!PageUptodate(page));
3330
3331                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3332
3333                 kaddr = kmap_atomic(page, KM_USER0);
3334                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3335                 kunmap_atomic(kaddr, KM_USER0);
3336
3337                 src_offset += cur;
3338                 len -= cur;
3339                 offset = 0;
3340                 i++;
3341         }
3342 }
3343 EXPORT_SYMBOL(copy_extent_buffer);
3344
3345 static void move_pages(struct page *dst_page, struct page *src_page,
3346                        unsigned long dst_off, unsigned long src_off,
3347                        unsigned long len)
3348 {
3349         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3350         if (dst_page == src_page) {
3351                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3352         } else {
3353                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3354                 char *p = dst_kaddr + dst_off + len;
3355                 char *s = src_kaddr + src_off + len;
3356
3357                 while (len--)
3358                         *--p = *--s;
3359
3360                 kunmap_atomic(src_kaddr, KM_USER1);
3361         }
3362         kunmap_atomic(dst_kaddr, KM_USER0);
3363 }
3364
3365 static void copy_pages(struct page *dst_page, struct page *src_page,
3366                        unsigned long dst_off, unsigned long src_off,
3367                        unsigned long len)
3368 {
3369         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3370         char *src_kaddr;
3371
3372         if (dst_page != src_page)
3373                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3374         else
3375                 src_kaddr = dst_kaddr;
3376
3377         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3378         kunmap_atomic(dst_kaddr, KM_USER0);
3379         if (dst_page != src_page)
3380                 kunmap_atomic(src_kaddr, KM_USER1);
3381 }
3382
3383 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3384                            unsigned long src_offset, unsigned long len)
3385 {
3386         size_t cur;
3387         size_t dst_off_in_page;
3388         size_t src_off_in_page;