Btrfs: Index extent buffers in an rbtree
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfc383c17a3a13e3877d94bd79bf4e58922b3e7f..32bb4ed3723deed4c3eeae52654293fddc3ffe92 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -91,29 +91,16 @@ void extent_io_tree_init(struct extent_io_tree *tree,
                          struct address_space *mapping, gfp_t mask)
 {
        tree->state.rb_node = NULL;
+       tree->buffer.rb_node = NULL;
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
-       spin_lock_init(&tree->lru_lock);
+       spin_lock_init(&tree->buffer_lock);
        tree->mapping = mapping;
-       INIT_LIST_HEAD(&tree->buffer_lru);
-       tree->lru_size = 0;
        tree->last = NULL;
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
-void extent_io_tree_empty_lru(struct extent_io_tree *tree)
-{
-       struct extent_buffer *eb;
-       while(!list_empty(&tree->buffer_lru)) {
-               eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
-                               lru);
-               list_del_init(&eb->lru);
-               free_extent_buffer(eb);
-       }
-}
-EXPORT_SYMBOL(extent_io_tree_empty_lru);
-
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
        struct extent_state *state;
@@ -245,6 +232,50 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
        return ret;
 }
 
+static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
+                                         u64 offset, struct rb_node *node)
+{
+       struct rb_root *root = &tree->buffer;
+       struct rb_node ** p = &root->rb_node;
+       struct rb_node * parent = NULL;
+       struct extent_buffer *eb;
+
+       while(*p) {
+               parent = *p;
+               eb = rb_entry(parent, struct extent_buffer, rb_node);
+
+               if (offset < eb->start)
+                       p = &(*p)->rb_left;
+               else if (offset > eb->start)
+                       p = &(*p)->rb_right;
+               else
+                       return eb;
+       }
+
+       rb_link_node(node, parent, p);
+       rb_insert_color(node, root);
+       return NULL;
+}
+
+static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+                                          u64 offset)
+{
+       struct rb_root *root = &tree->buffer;
+       struct rb_node * n = root->rb_node;
+       struct extent_buffer *eb;
+
+       while(n) {
+               eb = rb_entry(n, struct extent_buffer, rb_node);
+               if (offset < eb->start)
+                       n = n->rb_left;
+               else if (offset > eb->start)
+                       n = n->rb_right;
+               else
+                       return eb;
+       }
+       return NULL;
+}
+
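The two helpers above replace the old buffer LRU with an rbtree keyed by the
buffer's start offset. A minimal sketch of the lookup-or-insert idiom they are
built for, assuming the caller serializes on tree->buffer_lock the way
alloc_extent_buffer does later in this patch (the wrapper name is
hypothetical):

	static struct extent_buffer *lookup_or_insert(struct extent_io_tree *tree,
						      struct extent_buffer *new)
	{
		struct extent_buffer *exists;

		spin_lock(&tree->buffer_lock);
		exists = buffer_tree_insert(tree, new->start, &new->rb_node);
		if (exists)
			/* collision: take a reference on the indexed buffer */
			atomic_inc(&exists->refs);
		spin_unlock(&tree->buffer_lock);
		return exists;	/* NULL means 'new' is now in the tree */
	}
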
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -793,6 +824,13 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                      gfp_t mask)
+{
+       return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_ordered);
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
 {
@@ -812,8 +850,8 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
        return set_extent_bit(tree, start, end,
-                             EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-                             mask);
+                             EXTENT_DELALLOC | EXTENT_DIRTY,
+                             0, NULL, mask);
 }
 EXPORT_SYMBOL(set_extent_delalloc);
 
@@ -825,6 +863,13 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(clear_extent_dirty);
 
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                        gfp_t mask)
+{
+       return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_ordered);
+
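The new EXTENT_ORDERED bit (set by set_extent_ordered above, cleared here)
brackets ordered data writes. A minimal usage sketch, assuming the caller
already holds the range locked; the real call sites live in the ordered-data
code outside this file:

	/* mark the range when the ordered extent is created ... */
	set_extent_ordered(tree, start, end, GFP_NOFS);
	/* ... and clear it when the ordered I/O completes; wake == 1
	 * in clear_extent_ordered, so waiters on the range are woken */
	clear_extent_ordered(tree, start, end, GFP_NOFS);
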
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
@@ -1025,7 +1070,8 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 search_again:
        node = tree_search(tree, cur_start);
        if (!node) {
-               *end = (u64)-1;
+               if (!found)
+                       *end = (u64)-1;
                goto out;
        }
 
@@ -1365,7 +1411,7 @@ static int end_bio_extent_writepage(struct bio *bio,
                                   unsigned int bytes_done, int err)
 #endif
 {
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       int uptodate = err == 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_state *state = bio->bi_private;
        struct extent_io_tree *tree = state->tree;
@@ -1374,6 +1420,7 @@ static int end_bio_extent_writepage(struct bio *bio,
        u64 end;
        u64 cur;
        int whole_page;
+       int ret;
        unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -1393,6 +1440,23 @@ static int end_bio_extent_writepage(struct bio *bio,
 
                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);
+               if (tree->ops && tree->ops->writepage_end_io_hook) {
+                       ret = tree->ops->writepage_end_io_hook(page, start,
+                                                      end, state, uptodate);
+                       if (ret)
+                               uptodate = 0;
+               }
+
+               if (!uptodate && tree->ops &&
+                   tree->ops->writepage_io_failed_hook) {
+                       ret = tree->ops->writepage_io_failed_hook(bio, page,
+                                                        start, end, state);
+                       if (ret == 0) {
+                               state = NULL;
+                               uptodate = (err == 0);
+                               continue;
+                       }
+               }
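The write completion path now runs in two stages: writepage_end_io_hook gets
first say on every range, and failed writes are then offered to
writepage_io_failed_hook, whose return value decides whether the error
sticks. A hypothetical hook body, only to pin down the contract the loop
above assumes (the real implementation lives outside this diff):

	static int example_writepage_io_failed_hook(struct bio *bio,
						    struct page *page,
						    u64 start, u64 end,
						    struct extent_state *state)
	{
		/* returning 0 would mean "resubmitted elsewhere": the
		 * end_io loop then recomputes uptodate from err and skips
		 * the error handling for this bvec */
		return -EIO;	/* this sketch never retries */
	}
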
 
                if (!uptodate) {
                        clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
@@ -1400,11 +1464,6 @@ static int end_bio_extent_writepage(struct bio *bio,
                        SetPageError(page);
                }
 
-               if (tree->ops && tree->ops->writepage_end_io_hook) {
-                       tree->ops->writepage_end_io_hook(page, start, end,
-                                                        state);
-               }
-
                /*
                 * bios can get merged in funny ways, and so we need to
                 * be careful with the state variable.  We know the
@@ -1534,6 +1593,17 @@ static int end_bio_extent_readpage(struct bio *bio,
                        if (ret)
                                uptodate = 0;
                }
+               if (!uptodate && tree->ops &&
+                   tree->ops->readpage_io_failed_hook) {
+                       ret = tree->ops->readpage_io_failed_hook(bio, page,
+                                                        start, end, state);
+                       if (ret == 0) {
+                               state = NULL;
+                               uptodate =
+                                       test_bit(BIO_UPTODATE, &bio->bi_flags);
+                               continue;
+                       }
+               }
 
                spin_lock_irqsave(&tree->lock, flags);
                if (!state || state->end != end) {
@@ -1548,8 +1618,9 @@ static int end_bio_extent_readpage(struct bio *bio,
                        }
                        if (!state) {
                                spin_unlock_irqrestore(&tree->lock, flags);
-                               set_extent_uptodate(tree, start, end,
-                                                   GFP_ATOMIC);
+                               if (uptodate)
+                                       set_extent_uptodate(tree, start, end,
+                                                           GFP_ATOMIC);
                                unlock_extent(tree, start, end, GFP_ATOMIC);
                                goto next_io;
                        }
@@ -1567,8 +1638,10 @@ static int end_bio_extent_readpage(struct bio *bio,
                        } else {
                                state = NULL;
                        }
-                       set_state_cb(tree, clear, EXTENT_UPTODATE);
-                       clear->state |= EXTENT_UPTODATE;
+                       if (uptodate) {
+                               set_state_cb(tree, clear, EXTENT_UPTODATE);
+                               clear->state |= EXTENT_UPTODATE;
+                       }
                        clear_state_bit(tree, clear, EXTENT_LOCKED,
                                        1, 0);
                        if (cur == start)
@@ -1679,15 +1752,15 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
        }
 
        if (bio) {
+               bio->bi_size = 0;
                bio->bi_bdev = bdev;
                bio->bi_sector = first_sector;
        }
        return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 {
-       u64 maxsector;
        int ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct page *page = bvec->bv_page;
@@ -1715,14 +1788,9 @@ static int submit_one_bio(int rw, struct bio *bio)
 
        bio_get(bio);
 
-        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
-       if (maxsector < bio->bi_sector) {
-               printk("sector too large max %Lu got %llu\n", maxsector,
-                       (unsigned long long)bio->bi_sector);
-               WARN_ON(1);
-       }
        if (tree->ops && tree->ops->submit_bio_hook)
-               tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+               tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+                                          mirror_num);
        else
                submit_bio(rw, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1737,7 +1805,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              struct block_device *bdev,
                              struct bio **bio_ret,
                              unsigned long max_pages,
-                             bio_end_io_t end_io_func)
+                             bio_end_io_t end_io_func,
+                             int mirror_num)
 {
        int ret = 0;
        struct bio *bio;
@@ -1749,7 +1818,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                    (tree->ops && tree->ops->merge_bio_hook &&
                     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
                    bio_add_page(bio, page, size, offset) < size) {
-                       ret = submit_one_bio(rw, bio);
+                       ret = submit_one_bio(rw, bio, mirror_num);
                        bio = NULL;
                } else {
                        return 0;
@@ -1769,7 +1838,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        if (bio_ret) {
                *bio_ret = bio;
        } else {
-               ret = submit_one_bio(rw, bio);
+               ret = submit_one_bio(rw, bio, mirror_num);
        }
 
        return ret;
@@ -1779,9 +1848,8 @@ void set_page_extent_mapped(struct page *page)
 {
        if (!PagePrivate(page)) {
                SetPagePrivate(page);
-               WARN_ON(!page->mapping->a_ops->invalidatepage);
-               set_page_private(page, EXTENT_PAGE_PRIVATE);
                page_cache_get(page);
+               set_page_private(page, EXTENT_PAGE_PRIVATE);
        }
 }
 
@@ -1798,7 +1866,7 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
                                   struct page *page,
                                   get_extent_t *get_extent,
-                                  struct bio **bio)
+                                  struct bio **bio, int mirror_num)
 {
        struct inode *inode = page->mapping->host;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1843,9 +1911,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        unlock_extent(tree, cur, end, GFP_NOFS);
                        break;
                }
-
                extent_offset = cur - em->start;
+               if (extent_map_end(em) <= cur) {
+printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
+               }
                BUG_ON(extent_map_end(em) <= cur);
+               if (end < cur) {
+printk("2bad mapping end %Lu cur %Lu\n", end, cur);
+               }
                BUG_ON(end < cur);
 
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
@@ -1901,7 +1974,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                        ret = submit_extent_page(READ, tree, page,
                                         sector, iosize, page_offset,
                                         bdev, bio, nr,
-                                        end_bio_extent_readpage);
+                                        end_bio_extent_readpage, mirror_num);
                }
                if (ret)
                        SetPageError(page);
@@ -1923,9 +1996,9 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        struct bio *bio = NULL;
        int ret;
 
-       ret = __extent_read_full_page(tree, page, get_extent, &bio);
+       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
        if (bio)
-               submit_one_bio(READ, bio);
+               submit_one_bio(READ, bio, 0);
        return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -1951,12 +2024,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 iosize;
+       u64 unlock_start;
        sector_t sector;
        struct extent_map *em;
        struct block_device *bdev;
        int ret;
        int nr = 0;
-       size_t page_offset = 0;
+       size_t pg_offset = 0;
        size_t blocksize;
        loff_t i_size = i_size_read(inode);
        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1964,8 +2038,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 delalloc_end;
 
        WARN_ON(!PageLocked(page));
-       if (page->index > end_index) {
-               clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+       pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+       if (page->index > end_index ||
+          (page->index == end_index && !pg_offset)) {
+               page->mapping->a_ops->invalidatepage(page, 0);
                unlock_page(page);
                return 0;
        }
@@ -1973,13 +2049,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        if (page->index == end_index) {
                char *userpage;
 
-               size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-
                userpage = kmap_atomic(page, KM_USER0);
-               memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
-               flush_dcache_page(page);
+               memset(userpage + pg_offset, 0,
+                      PAGE_CACHE_SIZE - pg_offset);
                kunmap_atomic(userpage, KM_USER0);
+               flush_dcache_page(page);
        }
+       pg_offset = 0;
 
        set_page_extent_mapped(page);
 
@@ -2002,6 +2078,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                delalloc_start = delalloc_end + 1;
        }
        lock_extent(tree, start, page_end, GFP_NOFS);
+       unlock_start = start;
+
+       if (tree->ops && tree->ops->writepage_start_hook) {
+               ret = tree->ops->writepage_start_hook(page, start, page_end);
+               if (ret == -EAGAIN) {
+                       unlock_extent(tree, start, page_end, GFP_NOFS);
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return 0;
+               }
+       }
 
        end = page_end;
        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
@@ -2010,6 +2097,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
        if (last_byte <= start) {
                clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+               unlock_extent(tree, start, page_end, GFP_NOFS);
+               if (tree->ops && tree->ops->writepage_end_io_hook)
+                       tree->ops->writepage_end_io_hook(page, start,
+                                                        page_end, NULL, 1);
+               unlock_start = page_end + 1;
                goto done;
        }
 
@@ -2019,9 +2111,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        while (cur <= end) {
                if (cur >= last_byte) {
                        clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+                       unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+                       if (tree->ops && tree->ops->writepage_end_io_hook)
+                               tree->ops->writepage_end_io_hook(page, cur,
+                                                        page_end, NULL, 1);
+                       unlock_start = page_end + 1;
                        break;
                }
-               em = epd->get_extent(inode, page, page_offset, cur,
+               em = epd->get_extent(inode, page, pg_offset, cur,
                                     end - cur + 1, 1);
                if (IS_ERR(em) || !em) {
                        SetPageError(page);
@@ -2043,8 +2140,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                    block_start == EXTENT_MAP_INLINE) {
                        clear_extent_dirty(tree, cur,
                                           cur + iosize - 1, GFP_NOFS);
+
+                       unlock_extent(tree, unlock_start, cur + iosize - 1,
+                                     GFP_NOFS);
+
+                       if (tree->ops && tree->ops->writepage_end_io_hook)
+                               tree->ops->writepage_end_io_hook(page, cur,
+                                                        cur + iosize - 1,
+                                                        NULL, 1);
                        cur = cur + iosize;
-                       page_offset += iosize;
+                       pg_offset += iosize;
+                       unlock_start = cur;
                        continue;
                }
 
@@ -2052,7 +2158,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
                                   EXTENT_DIRTY, 0)) {
                        cur = cur + iosize;
-                       page_offset += iosize;
+                       pg_offset += iosize;
                        continue;
                }
                clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -2062,10 +2168,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                } else {
                        ret = 0;
                }
-               if (ret)
+               if (ret) {
                        SetPageError(page);
-               else {
+               } else {
                        unsigned long max_nr = end_index + 1;
+
                        set_range_writeback(tree, cur, cur + iosize - 1);
                        if (!PageWriteback(page)) {
                                printk("warning page %lu not writeback, "
@@ -2075,14 +2182,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        }
 
                        ret = submit_extent_page(WRITE, tree, page, sector,
-                                                iosize, page_offset, bdev,
+                                                iosize, pg_offset, bdev,
                                                 &epd->bio, max_nr,
-                                                end_bio_extent_writepage);
+                                                end_bio_extent_writepage, 0);
                        if (ret)
                                SetPageError(page);
                }
                cur = cur + iosize;
-               page_offset += iosize;
+               pg_offset += iosize;
                nr++;
        }
 done:
@@ -2091,13 +2198,13 @@ done:
                set_page_writeback(page);
                end_page_writeback(page);
        }
-       unlock_extent(tree, start, page_end, GFP_NOFS);
+       if (unlock_start <= page_end)
+               unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
        unlock_page(page);
        return 0;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 /* Taken directly from 2.6.23 for 2.6.18 back port */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
@@ -2244,7 +2351,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
        write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
        if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio);
+               submit_one_bio(WRITE, epd.bio, 0);
        }
        return ret;
 }
@@ -2265,7 +2372,7 @@ int extent_writepages(struct extent_io_tree *tree,
 
        ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
        if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio);
+               submit_one_bio(WRITE, epd.bio, 0);
        }
        return ret;
 }
@@ -2297,7 +2404,8 @@ int extent_readpages(struct extent_io_tree *tree,
                        page_cache_get(page);
                        if (!pagevec_add(&pvec, page))
                                __pagevec_lru_add(&pvec);
-                       __extent_read_full_page(tree, page, get_extent, &bio);
+                       __extent_read_full_page(tree, page, get_extent,
+                                               &bio, 0);
                }
                page_cache_release(page);
        }
@@ -2305,7 +2413,7 @@ int extent_readpages(struct extent_io_tree *tree,
                __pagevec_lru_add(&pvec);
        BUG_ON(!list_empty(pages));
        if (bio)
-               submit_one_bio(READ, bio);
+               submit_one_bio(READ, bio, 0);
        return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2430,7 +2538,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
                        ret = submit_extent_page(READ, tree, page,
                                         sector, iosize, page_offset, em->bdev,
                                         NULL, 1,
-                                        end_bio_extent_preparewrite);
+                                        end_bio_extent_preparewrite, 0);
                        iocount++;
                        block_start = block_start + iosize;
                } else {
@@ -2453,6 +2561,32 @@ err:
 }
 EXPORT_SYMBOL(extent_prepare_write);
 
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+int try_release_extent_state(struct extent_map_tree *map,
+                            struct extent_io_tree *tree, struct page *page,
+                            gfp_t mask)
+{
+       u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+       u64 end = start + PAGE_CACHE_SIZE - 1;
+       int ret = 1;
+
+       if (test_range_bit(tree, start, end,
+                          EXTENT_IOBITS | EXTENT_ORDERED, 0))
+               ret = 0;
+       else {
+               if ((mask & GFP_NOFS) == GFP_NOFS)
+                       mask = GFP_NOFS;
+               clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+                                1, 1, mask);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(try_release_extent_state);
+
 /*
  * a helper for releasepage.  As long as there are no locked extents
  * in the range corresponding to the page, both state records and extent
@@ -2465,8 +2599,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
        struct extent_map *em;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
-       u64 orig_start = start;
-       int ret = 1;
+
        if ((mask & __GFP_WAIT) &&
            page->mapping->host->i_size > 16 * 1024 * 1024) {
                u64 len;
@@ -2478,7 +2611,8 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                                spin_unlock(&map->lock);
                                break;
                        }
-                       if (em->start != start) {
+                       if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+                           em->start != start) {
                                spin_unlock(&map->lock);
                                free_extent_map(em);
                                break;
@@ -2497,15 +2631,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                        free_extent_map(em);
                }
        }
-       if (test_range_bit(tree, orig_start, end, EXTENT_IOBITS, 0))
-               ret = 0;
-       else {
-               if ((mask & GFP_NOFS) == GFP_NOFS)
-                       mask = GFP_NOFS;
-               clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-                                1, 1, mask);
-       }
-       return ret;
+       return try_release_extent_state(map, tree, page, mask);
 }
 EXPORT_SYMBOL(try_release_extent_mapping);
 
@@ -2531,51 +2657,6 @@ out:
        return sector;
 }
 
-static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
-{
-       if (list_empty(&eb->lru)) {
-               extent_buffer_get(eb);
-               list_add(&eb->lru, &tree->buffer_lru);
-               tree->lru_size++;
-               if (tree->lru_size >= BUFFER_LRU_MAX) {
-                       struct extent_buffer *rm;
-                       rm = list_entry(tree->buffer_lru.prev,
-                                       struct extent_buffer, lru);
-                       tree->lru_size--;
-                       list_del_init(&rm->lru);
-                       free_extent_buffer(rm);
-               }
-       } else
-               list_move(&eb->lru, &tree->buffer_lru);
-       return 0;
-}
-static struct extent_buffer *find_lru(struct extent_io_tree *tree,
-                                     u64 start, unsigned long len)
-{
-       struct list_head *lru = &tree->buffer_lru;
-       struct list_head *cur = lru->next;
-       struct extent_buffer *eb;
-
-       if (list_empty(lru))
-               return NULL;
-
-       do {
-               eb = list_entry(cur, struct extent_buffer, lru);
-               if (eb->start == start && eb->len == len) {
-                       extent_buffer_get(eb);
-                       return eb;
-               }
-               cur = cur->next;
-       } while (cur != lru);
-       return NULL;
-}
-
-static inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-       return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-               (start >> PAGE_CACHE_SHIFT);
-}
-
 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
 {
@@ -2592,44 +2673,10 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
        return p;
 }
 
-int release_extent_buffer_tail_pages(struct extent_buffer *eb)
-{
-       unsigned long num_pages = num_extent_pages(eb->start, eb->len);
-       struct page *page;
-       unsigned long i;
-
-       if (num_pages == 1)
-               return 0;
-       for (i = 1; i < num_pages; i++) {
-               page = extent_buffer_page(eb, i);
-               page_cache_release(page);
-       }
-       return 0;
-}
-
-
-int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
-                         unsigned long len)
+static inline unsigned long num_extent_pages(u64 start, u64 len)
 {
-       struct list_head *lru = &tree->buffer_lru;
-       struct list_head *cur = lru->next;
-       struct extent_buffer *eb;
-       int found = 0;
-
-       spin_lock(&tree->lru_lock);
-       if (list_empty(lru))
-               goto out;
-
-       do {
-               eb = list_entry(cur, struct extent_buffer, lru);
-               if (eb->start <= start && eb->start + eb->len > start) {
-                       eb->flags &= ~EXTENT_UPTODATE;
-               }
-               cur = cur->next;
-       } while (cur != lru);
-out:
-       spin_unlock(&tree->lru_lock);
-       return found;
+       return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+               (start >> PAGE_CACHE_SHIFT);
 }
 
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
@@ -2640,15 +2687,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        struct extent_buffer *eb = NULL;
        unsigned long flags;
 
-       spin_lock(&tree->lru_lock);
-       eb = find_lru(tree, start, len);
-       spin_unlock(&tree->lru_lock);
-       if (eb) {
-               return eb;
-       }
-
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
-       INIT_LIST_HEAD(&eb->lru);
        eb->start = start;
        eb->len = len;
        spin_lock_irqsave(&leak_lock, flags);
@@ -2677,17 +2716,24 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        unsigned long i;
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        struct extent_buffer *eb;
+       struct extent_buffer *exists = NULL;
        struct page *p;
        struct address_space *mapping = tree->mapping;
        int uptodate = 1;
 
+       spin_lock(&tree->buffer_lock);
+       eb = buffer_search(tree, start);
+       if (eb) {
+               atomic_inc(&eb->refs);
+               spin_unlock(&tree->buffer_lock);
+               return eb;
+       }
+       spin_unlock(&tree->buffer_lock);
+
        eb = __alloc_extent_buffer(tree, start, len, mask);
        if (!eb)
                return NULL;
 
-       if (eb->flags & EXTENT_BUFFER_FILLED)
-               goto lru_add;
-
        if (page0) {
                eb->first_page = page0;
                i = 1;
@@ -2696,6 +2742,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                mark_page_accessed(page0);
                set_page_extent_mapped(page0);
                set_page_extent_head(page0, len);
+               uptodate = PageUptodate(page0);
        } else {
                i = 0;
        }
@@ -2703,7 +2750,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
                if (!p) {
                        WARN_ON(1);
-                       goto fail;
+                       goto free_eb;
                }
                set_page_extent_mapped(p);
                mark_page_accessed(p);
@@ -2721,25 +2768,28 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                eb->flags |= EXTENT_UPTODATE;
        eb->flags |= EXTENT_BUFFER_FILLED;
 
-lru_add:
-       spin_lock(&tree->lru_lock);
-       add_lru(tree, eb);
-       spin_unlock(&tree->lru_lock);
+       spin_lock(&tree->buffer_lock);
+       exists = buffer_tree_insert(tree, start, &eb->rb_node);
+       if (exists) {
+               /* add one reference for the caller */
+               atomic_inc(&exists->refs);
+               spin_unlock(&tree->buffer_lock);
+               goto free_eb;
+       }
+       spin_unlock(&tree->buffer_lock);
+
+       /* add one reference for the tree */
+       atomic_inc(&eb->refs);
        return eb;
 
-fail:
-       spin_lock(&tree->lru_lock);
-       list_del_init(&eb->lru);
-       spin_unlock(&tree->lru_lock);
+free_eb:
        if (!atomic_dec_and_test(&eb->refs))
-               return NULL;
-       for (index = 1; index < i; index++) {
+               return exists;
+       for (index = 1; index < i; index++)
                page_cache_release(extent_buffer_page(eb, index));
-       }
-       if (i > 0)
-               page_cache_release(extent_buffer_page(eb, 0));
+       page_cache_release(extent_buffer_page(eb, 0));
        __free_extent_buffer(eb);
-       return NULL;
+       return exists;
 }
 EXPORT_SYMBOL(alloc_extent_buffer);
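alloc_extent_buffer resolves allocation races through the rbtree: the loser
of the insert race frees its private copy under free_eb and returns the
winner with an extra reference. The resulting scheme is one reference held by
the tree plus one per caller, so typical use looks like this sketch (error
handling elided):

	struct extent_buffer *eb;

	eb = find_extent_buffer(tree, start, len, GFP_NOFS); /* +1 ref on hit */
	if (!eb)
		eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
	if (eb) {
		/* ... use the buffer ... */
		free_extent_buffer(eb);	/* drops the caller's reference only;
					 * the tree's reference keeps eb alive
					 * until try_release_extent_buffer() */
	}
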
 
@@ -2747,84 +2797,27 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len,
                                          gfp_t mask)
 {
-       unsigned long num_pages = num_extent_pages(start, len);
-       unsigned long i;
-       unsigned long index = start >> PAGE_CACHE_SHIFT;
        struct extent_buffer *eb;
-       struct page *p;
-       struct address_space *mapping = tree->mapping;
-       int uptodate = 1;
-
-       eb = __alloc_extent_buffer(tree, start, len, mask);
-       if (!eb)
-               return NULL;
 
-       if (eb->flags & EXTENT_BUFFER_FILLED)
-               goto lru_add;
+       spin_lock(&tree->buffer_lock);
+       eb = buffer_search(tree, start);
+       if (eb)
+               atomic_inc(&eb->refs);
+       spin_unlock(&tree->buffer_lock);
 
-       for (i = 0; i < num_pages; i++, index++) {
-               p = find_lock_page(mapping, index);
-               if (!p) {
-                       goto fail;
-               }
-               set_page_extent_mapped(p);
-               mark_page_accessed(p);
-
-               if (i == 0) {
-                       eb->first_page = p;
-                       set_page_extent_head(p, len);
-               } else {
-                       set_page_private(p, EXTENT_PAGE_PRIVATE);
-               }
-
-               if (!PageUptodate(p))
-                       uptodate = 0;
-               unlock_page(p);
-       }
-       if (uptodate)
-               eb->flags |= EXTENT_UPTODATE;
-       eb->flags |= EXTENT_BUFFER_FILLED;
-
-lru_add:
-       spin_lock(&tree->lru_lock);
-       add_lru(tree, eb);
-       spin_unlock(&tree->lru_lock);
        return eb;
-fail:
-       spin_lock(&tree->lru_lock);
-       list_del_init(&eb->lru);
-       spin_unlock(&tree->lru_lock);
-       if (!atomic_dec_and_test(&eb->refs))
-               return NULL;
-       for (index = 1; index < i; index++) {
-               page_cache_release(extent_buffer_page(eb, index));
-       }
-       if (i > 0)
-               page_cache_release(extent_buffer_page(eb, 0));
-       __free_extent_buffer(eb);
-       return NULL;
 }
 EXPORT_SYMBOL(find_extent_buffer);
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
-       unsigned long i;
-       unsigned long num_pages;
-
        if (!eb)
                return;
 
        if (!atomic_dec_and_test(&eb->refs))
                return;
 
-       WARN_ON(!list_empty(&eb->lru));
-       num_pages = num_extent_pages(eb->start, eb->len);
-
-       for (i = 1; i < num_pages; i++) {
-               page_cache_release(extent_buffer_page(eb, i));
-       }
-       page_cache_release(extent_buffer_page(eb, 0));
-       __free_extent_buffer(eb);
+       WARN_ON(1);
 }
 EXPORT_SYMBOL(free_extent_buffer);
 
@@ -2844,7 +2837,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               lock_page(page);
                if (i == 0)
                        set_page_extent_head(page, eb->len);
                else
@@ -2862,7 +2854,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                        end  = start + PAGE_CACHE_SIZE - 1;
                        if (test_range_bit(tree, start, end,
                                           EXTENT_DIRTY, 0)) {
-                               unlock_page(page);
                                continue;
                        }
                }
@@ -2874,7 +2865,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                                                PAGECACHE_TAG_DIRTY);
                }
                read_unlock_irq(&page->mapping->tree_lock);
-               unlock_page(page);
        }
        return 0;
 }
@@ -2903,23 +2893,37 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
                 * on us if the page isn't already dirty.
                 */
                if (i == 0) {
-                       lock_page(page);
                        set_page_extent_head(page, eb->len);
                } else if (PagePrivate(page) &&
                           page->private != EXTENT_PAGE_PRIVATE) {
-                       lock_page(page);
                        set_page_extent_mapped(page);
-                       unlock_page(page);
                }
                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-               if (i == 0)
-                       unlock_page(page);
        }
        return set_extent_dirty(tree, eb->start,
                                eb->start + eb->len - 1, GFP_NOFS);
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+                               struct extent_buffer *eb)
+{
+       unsigned long i;
+       struct page *page;
+       unsigned long num_pages;
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       eb->flags &= ~EXTENT_UPTODATE;
+
+       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                             GFP_NOFS);
+       for (i = 0; i < num_pages; i++) {
+               page = extent_buffer_page(eb, i);
+               ClearPageUptodate(page);
+       }
+       return 0;
+}
+
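clear_extent_buffer_uptodate is the inverse of set_extent_buffer_uptodate
below. The expected use (an assumption; the callers are outside this file) is
to force a re-read after a verification failure:

	/* checksum mismatch: drop cached uptodate state, then re-read,
	 * e.g. from the next mirror */
	clear_extent_buffer_uptodate(io_tree, eb);
	ret = read_extent_buffer_pages(io_tree, eb, 0, 1, get_extent,
				       mirror_num + 1);
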
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                                struct extent_buffer *eb)
 {
@@ -2975,17 +2979,18 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
                           struct extent_buffer *eb)
 {
        int ret = 0;
-       int ret2;
        unsigned long num_pages;
        unsigned long i;
        struct page *page;
        int pg_uptodate = 1;
 
        if (eb->flags & EXTENT_UPTODATE)
-               ret = 1;
+               return 1;
 
-       ret2 = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+       ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
                           EXTENT_UPTODATE, 1);
+       if (ret)
+               return ret;
 
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
@@ -2995,18 +3000,14 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
                        break;
                }
        }
-       if ((ret || ret2) && !pg_uptodate) {
-printk("uptodate error2 eb %Lu ret %d ret2 %d pg_uptodate %d\n", eb->start, ret, ret2, pg_uptodate);
-               WARN_ON(1);
-       }
-       return (ret || ret2);
+       return pg_uptodate;
 }
 EXPORT_SYMBOL(extent_buffer_uptodate);
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb,
                             u64 start, int wait,
-                            get_extent_t *get_extent)
+                            get_extent_t *get_extent, int mirror_num)
 {
        unsigned long i;
        unsigned long start_i;
@@ -3062,8 +3063,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                if (!PageUptodate(page)) {
                        if (start_i == 0)
                                inc_all_pages = 1;
+                       ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
-                                                     get_extent, &bio);
+                                                     get_extent, &bio,
+                                                     mirror_num);
                        if (err) {
                                ret = err;
                        }
@@ -3073,7 +3076,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        }
 
        if (bio)
-               submit_one_bio(READ, bio);
+               submit_one_bio(READ, bio, mirror_num);
 
        if (ret || !wait) {
                return ret;
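With mirror_num plumbed through, a read caller can walk the copies on
repeated failures. A sketch, assuming mirror_num == 0 lets the lower layers
pick any copy while values >= 1 force a specific mirror (the hooks enforcing
that live outside this file; num_copies is a hypothetical count of available
mirrors):

	int mirror_num = 0;
	int ret;

	do {
		ret = read_extent_buffer_pages(tree, eb, 0, 1, get_extent,
					       mirror_num);
		mirror_num++;		/* next pass forces the next copy */
	} while (ret && mirror_num <= num_copies);
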
@@ -3471,3 +3474,35 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
 EXPORT_SYMBOL(memmove_extent_buffer);
+
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+{
+       u64 start = page_offset(page);
+       struct extent_buffer *eb;
+       int ret = 1;
+       unsigned long i;
+       unsigned long num_pages;
+
+       spin_lock(&tree->buffer_lock);
+       eb = buffer_search(tree, start);
+       if (!eb)
+               goto out;
+
+       if (atomic_read(&eb->refs) > 1) {
+               ret = 0;
+               goto out;
+       }
+       /* at this point we can safely release the extent buffer */
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *page = extent_buffer_page(eb, i);
+               page_cache_release(page);
+       }
+       rb_erase(&eb->rb_node, &tree->buffer);
+       __free_extent_buffer(eb);
+out:
+       spin_unlock(&tree->buffer_lock);
+       return ret;
+}
+EXPORT_SYMBOL(try_release_extent_buffer);
+
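
try_release_extent_buffer frees the buffer only when the tree holds the last
reference (the refs > 1 check), so pinned buffers keep their pages.
Hypothetical releasepage glue showing how it composes with
try_release_extent_state from earlier in this patch (the real address_space
wiring is outside this diff; tree_for_page is a made-up helper):

	static int example_releasepage(struct page *page, gfp_t gfp)
	{
		struct extent_io_tree *tree = tree_for_page(page);

		/* buffer still in use somewhere: the page cannot go */
		if (!try_release_extent_buffer(tree, page))
			return 0;
		/* map argument is unused by the new state helper */
		return try_release_extent_state(NULL, tree, page, gfp);
	}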