Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfashe...
authorLinus Torvalds <torvalds@woody.linux-foundation.org>
Mon, 16 Jul 2007 17:52:55 +0000 (10:52 -0700)
committerLinus Torvalds <torvalds@woody.linux-foundation.org>
Mon, 16 Jul 2007 17:52:55 +0000 (10:52 -0700)
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits)
  [PATCH] ocfs2: zero_user_page conversion
  ocfs2: Support xfs style space reservation ioctls
  ocfs2: support for removing file regions
  ocfs2: update truncate handling of partial clusters
  ocfs2: btree support for removal of arbitrary extents
  ocfs2: Support creation of unwritten extents
  ocfs2: support writing of unwritten extents
  ocfs2: small cleanup of ocfs2_write_begin_nolock()
  ocfs2: btree changes for unwritten extents
  ocfs2: abstract btree growing calls
  ocfs2: use all extent block suballocators
  ocfs2: plug truncate into cached dealloc routines
  ocfs2: simplify deallocation locking
  ocfs2: harden buffer check during mapping of page blocks
  ocfs2: shared writeable mmap
  ocfs2: factor out write aops into nolock variants
  ocfs2: rework ocfs2_buffered_write_cluster()
  ocfs2: take ip_alloc_sem during entire truncate
  ocfs2: Add "preferred slot" mount option
  [KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c
  ...

39 files changed:
Documentation/filesystems/configfs/configfs.txt
Documentation/filesystems/configfs/configfs_example.c
fs/configfs/configfs_internal.h
fs/configfs/dir.c
fs/configfs/file.c
fs/configfs/item.c
fs/dlm/config.c
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/heartbeat.h
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/cluster/nodemanager.h
fs/ocfs2/cluster/tcp.c
fs/ocfs2/dir.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlmglue.c
fs/ocfs2/endian.h
fs/ocfs2/extent_map.c
fs/ocfs2/file.c
fs/ocfs2/file.h
fs/ocfs2/heartbeat.c
fs/ocfs2/ioctl.c
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/mmap.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/slot_map.c
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/super.h
include/linux/configfs.h

index b34cdb50eab466ca539a559228c5cfc6a18e6bde..d1b98257d00063db10075d54180b7497bcc99d41 100644 (file)
@@ -238,6 +238,8 @@ config_item_type.
                struct config_group *(*make_group)(struct config_group *group,
                                                   const char *name);
                int (*commit_item)(struct config_item *item);
+               void (*disconnect_notify)(struct config_group *group,
+                                         struct config_item *item);
                void (*drop_item)(struct config_group *group,
                                  struct config_item *item);
        };
@@ -268,6 +270,16 @@ the item in other threads, the memory is safe.  It may take some time
 for the item to actually disappear from the subsystem's usage.  But it
 is gone from configfs.
 
+When drop_item() is called, the item's linkage has already been torn
+down.  It no longer has a reference on its parent and has no place in
+the item hierarchy.  If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method.  The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group.  Like drop_item(),
+disconnect_notify() is void and cannot fail.  Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
 A config_group cannot be removed while it still has child items.  This
 is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
 called, as the item has not been dropped.  rmdir(2) will fail, as the
@@ -280,18 +292,18 @@ tells configfs to make the subsystem appear in the file tree.
 
        struct configfs_subsystem {
                struct config_group     su_group;
-               struct semaphore        su_sem;
+               struct mutex            su_mutex;
        };
 
        int configfs_register_subsystem(struct configfs_subsystem *subsys);
        void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
 
-       A subsystem consists of a toplevel config_group and a semaphore.
+       A subsystem consists of a toplevel config_group and a mutex.
 The group is where child config_items are created.  For a subsystem,
 this group is usually defined statically.  Before calling
 configfs_register_subsystem(), the subsystem must have initialized the
 group via the usual group _init() functions, and it must also have
-initialized the semaphore.
+initialized the mutex.
        When the register call returns, the subsystem is live, and it
 will be visible via configfs.  At that point, mkdir(2) can be called and
 the subsystem must be ready for it.
@@ -303,7 +315,7 @@ subsystem/group and the simple_child item in configfs_example.c  It
 shows a trivial object displaying and storing an attribute, and a simple
 group creating and destroying these children.
 
-[Hierarchy Navigation and the Subsystem Semaphore]
+[Hierarchy Navigation and the Subsystem Mutex]
 
 There is an extra bonus that configfs provides.  The config_groups and
 config_items are arranged in a hierarchy due to the fact that they
@@ -314,19 +326,19 @@ and config_item->ci_parent structure members.
 
 A subsystem can navigate the cg_children list and the ci_parent pointer
 to see the tree created by the subsystem.  This can race with configfs'
-management of the hierarchy, so configfs uses the subsystem semaphore to
+management of the hierarchy, so configfs uses the subsystem mutex to
 protect modifications.  Whenever a subsystem wants to navigate the
 hierarchy, it must do so under the protection of the subsystem
-semaphore.
+mutex.
 
-A subsystem will be prevented from acquiring the semaphore while a newly
+A subsystem will be prevented from acquiring the mutex while a newly
 allocated item has not been linked into this hierarchy.   Similarly, it
-will not be able to acquire the semaphore while a dropping item has not
+will not be able to acquire the mutex while a dropping item has not
 yet been unlinked.  This means that an item's ci_parent pointer will
 never be NULL while the item is in configfs, and that an item will only
 be in its parent's cg_children list for the same duration.  This allows
 a subsystem to trust ci_parent and cg_children while they hold the
-semaphore.
+mutex.
 
 [Item Aggregation Via symlink(2)]
 
@@ -386,6 +398,33 @@ As a consequence of this, default_groups cannot be removed directly via
 rmdir(2).  They also are not considered when rmdir(2) on the parent
 group is checking for children.
 
+[Dependent Subsystems]
+
+Sometimes other drivers depend on particular configfs items.  For
+example, ocfs2 mounts depend on a heartbeat region item.  If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly.  Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item().  A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on.  configfs will then return -EBUSY from rmdir(2) for that
+item.  When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These APIs cannot be called underneath any configfs callbacks, as
+they will conflict.  They can block and allocate.  A client driver
+probably shouldn't be calling them of its own gumption.  Rather it should
+be providing an API that external subsystems call.
+
+How does this work?  Imagine the ocfs2 mount process.  When it mounts,
+it asks for a heartbeat region item.  This is done via a call into the
+heartbeat code.  Inside the heartbeat code, the region item is looked
+up.  Here, the heartbeat code calls configfs_depend_item().  If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
 [Committable Items]
 
 NOTE: Committable items are currently unimplemented.
index 2d6a14a463e072935dcf78a3492693cf230708af..e56d49264b3938201e8bc8689157b8b66040f883 100644 (file)
@@ -453,7 +453,7 @@ static int __init configfs_example_init(void)
                subsys = example_subsys[i];
 
                config_group_init(&subsys->su_group);
-               init_MUTEX(&subsys->su_sem);
+               mutex_init(&subsys->su_mutex);
                ret = configfs_register_subsystem(subsys);
                if (ret) {
                        printk(KERN_ERR "Error %d while registering subsystem %s\n",
index 7b48c034b312128cbf64fe2094aa7155b5323fdb..3b0185fdf9a4c8834dc0e669a68cffe84ec12ece 100644 (file)
 
 struct configfs_dirent {
        atomic_t                s_count;
+       int                     s_dependent_count;
        struct list_head        s_sibling;
        struct list_head        s_children;
        struct list_head        s_links;
-       void                    * s_element;
+       void                    * s_element;
        int                     s_type;
        umode_t                 s_mode;
        struct dentry           * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
 
 #define CONFIGFS_ROOT          0x0001
 #define CONFIGFS_DIR           0x0002
-#define CONFIGFS_ITEM_ATTR     0x0004
-#define CONFIGFS_ITEM_LINK     0x0020
+#define CONFIGFS_ITEM_ATTR     0x0004
+#define CONFIGFS_ITEM_LINK     0x0020
 #define CONFIGFS_USET_DIR      0x0040
 #define CONFIGFS_USET_DEFAULT  0x0080
 #define CONFIGFS_USET_DROPPING 0x0100
index 5e6e37e58f36f5369201f0bb0482957beab20fdc..2f436d4f1d6db5ab11eacf6c5f3a766f70b03780 100644 (file)
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
                        /* Mark that we've taken i_mutex */
                        sd->s_type |= CONFIGFS_USET_DROPPING;
 
+                       /*
+                        * Yup, recursive.  If there's a problem, blame
+                        * deep nesting of default_groups
+                        */
                        ret = configfs_detach_prep(sd->s_dentry);
                        if (!ret)
                                continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
 
 /*
  * All of link_obj/unlink_obj/link_group/unlink_group require that
- * subsys->su_sem is held.
+ * subsys->su_mutex is held.
  */
 
 static void unlink_obj(struct config_item *item)
@@ -713,6 +717,28 @@ static void configfs_detach_group(struct config_item *item)
        configfs_detach_item(item);
 }
 
+/*
+ * After the item has been detached from the filesystem view, we are
+ * ready to tear it out of the hierarchy.  Notify the client before
+ * we do that so they can perform any cleanup that requires
+ * navigating the hierarchy.  A client does not need to provide this
+ * callback.  The subsystem mutex MUST be held by the caller, and
+ * references must be valid for both items.  It also assumes the
+ * caller has validated ci_type.
+ */
+static void client_disconnect_notify(struct config_item *parent_item,
+                                    struct config_item *item)
+{
+       struct config_item_type *type;
+
+       type = parent_item->ci_type;
+       BUG_ON(!type);
+
+       if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
+               type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
+                                                     item);
+}
+
 /*
  * Drop the initial reference from make_item()/make_group()
  * This function assumes that reference is held on item
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
         */
        if (type->ct_group_ops && type->ct_group_ops->drop_item)
                type->ct_group_ops->drop_item(to_config_group(parent_item),
-                                               item);
+                                             item);
        else
                config_item_put(item);
 }
 
+#ifdef DEBUG
+static void configfs_dump_one(struct configfs_dirent *sd, int level)
+{
+       printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+
+#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+       type_print(CONFIGFS_ROOT);
+       type_print(CONFIGFS_DIR);
+       type_print(CONFIGFS_ITEM_ATTR);
+       type_print(CONFIGFS_ITEM_LINK);
+       type_print(CONFIGFS_USET_DIR);
+       type_print(CONFIGFS_USET_DEFAULT);
+       type_print(CONFIGFS_USET_DROPPING);
+#undef type_print
+}
+
+static int configfs_dump(struct configfs_dirent *sd, int level)
+{
+       struct configfs_dirent *child_sd;
+       int ret = 0;
+
+       configfs_dump_one(sd, level);
+
+       if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
+               return 0;
+
+       list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+               ret = configfs_dump(child_sd, level + 2);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+#endif
+
+
+/*
+ * configfs_depend_item() and configfs_undepend_item()
+ *
+ * WARNING: Do not call these from a configfs callback!
+ *
+ * This describes these functions and their helpers.
+ *
+ * Allow another kernel system to depend on a config_item.  If this
+ * happens, the item cannot go away until the dependent can live without
+ * it.  The idea is to give client modules as simple an interface as
+ * possible.  When a system asks them to depend on an item, they just
+ * call configfs_depend_item().  If the item is live and the client
+ * driver is in good shape, we'll happily do the work for them.
+ *
+ * Why is the locking complex?  Because configfs uses the VFS to handle
+ * all locking, but this function is called outside the normal
+ * VFS->configfs path.  So it must take VFS locks to prevent the
+ * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc).  This is
+ * why you can't call these functions underneath configfs callbacks.
+ *
+ * Note, btw, that this can be called at *any* time, even when a configfs
+ * subsystem isn't registered, or when configfs is loading or unloading.
+ * Just like configfs_register_subsystem().  So we take the same
+ * precautions.  We pin the filesystem.  We lock each i_mutex _in_order_
+ * on our way down the tree.  If we can find the target item in the
+ * configfs tree, it must be part of the subsystem tree as well, so we
+ * do not need the subsystem mutex.  Holding the i_mutex chain locks
+ * out mkdir() and rmdir(), who might be racing us.
+ */
+
+/*
+ * configfs_depend_prep()
+ *
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes.  This is similar but not the same to configfs_detach_prep().
+ * Note that configfs_detach_prep() expects the parent to be locked when it
+ * is called, but we lock the parent *inside* configfs_depend_prep().  We
+ * do that so we can unlock it if we find nothing.
+ *
+ * Here we do a depth-first search of the dentry hierarchy looking for
+ * our object.  We take i_mutex on each step of the way down.  IT IS
+ * ESSENTIAL THAT i_mutex LOCKING IS ORDERED.  If we come back up a branch,
+ * we'll drop the i_mutex.
+ *
+ * If the target is not found, -ENOENT is bubbled up and we have released
+ * all locks.  If the target was found, the locks will be cleared by
+ * configfs_depend_rollback().
+ *
+ * This adds a requirement that all config_items be unique!
+ *
+ * This is recursive because the locking traversal is tricky.  There isn't
+ * much on the stack, though, so folks that need this function - be careful
+ * about your stack!  Patches will be accepted to make it iterative.
+ */
+static int configfs_depend_prep(struct dentry *origin,
+                               struct config_item *target)
+{
+       struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+       int ret = 0;
+
+       BUG_ON(!origin || !sd);
+
+       /* Lock this guy on the way down */
+       mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+       if (sd->s_element == target)  /* Boo-yah */
+               goto out;
+
+       list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+               if (child_sd->s_type & CONFIGFS_DIR) {
+                       ret = configfs_depend_prep(child_sd->s_dentry,
+                                                  target);
+                       if (!ret)
+                               goto out;  /* Child path boo-yah */
+               }
+       }
+
+       /* We looped all our children and didn't find target */
+       mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+       ret = -ENOENT;
+
+out:
+       return ret;
+}
+
+/*
+ * This is ONLY called if configfs_depend_prep() did its job.  So we can
+ * trust the entire path from item back up to origin.
+ *
+ * We walk backwards from item, unlocking each i_mutex.  We finish by
+ * unlocking origin.
+ */
+static void configfs_depend_rollback(struct dentry *origin,
+                                    struct config_item *item)
+{
+       struct dentry *dentry = item->ci_dentry;
+
+       while (dentry != origin) {
+               mutex_unlock(&dentry->d_inode->i_mutex);
+               dentry = dentry->d_parent;
+       }
+
+       mutex_unlock(&origin->d_inode->i_mutex);
+}
+
+int configfs_depend_item(struct configfs_subsystem *subsys,
+                        struct config_item *target)
+{
+       int ret;
+       struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+       struct config_item *s_item = &subsys->su_group.cg_item;
+
+       /*
+        * Pin the configfs filesystem.  This means we can safely access
+        * the root of the configfs filesystem.
+        */
+       ret = configfs_pin_fs();
+       if (ret)
+               return ret;
+
+       /*
+        * Next, lock the root directory.  We're going to check that the
+        * subsystem is really registered, and so we need to lock out
+        * configfs_[un]register_subsystem().
+        */
+       mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+
+       root_sd = configfs_sb->s_root->d_fsdata;
+
+       list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+               if (p->s_type & CONFIGFS_DIR) {
+                       if (p->s_element == s_item) {
+                               subsys_sd = p;
+                               break;
+                       }
+               }
+       }
+
+       if (!subsys_sd) {
+               ret = -ENOENT;
+               goto out_unlock_fs;
+       }
+
+       /* Ok, now we can trust subsys/s_item */
+
+       /* Scan the tree, locking i_mutex recursively, return 0 if found */
+       ret = configfs_depend_prep(subsys_sd->s_dentry, target);
+       if (ret)
+               goto out_unlock_fs;
+
+       /* We hold all i_mutexes from the subsystem down to the target */
+       p = target->ci_dentry->d_fsdata;
+       p->s_dependent_count += 1;
+
+       configfs_depend_rollback(subsys_sd->s_dentry, target);
+
+out_unlock_fs:
+       mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+
+       /*
+        * If we succeeded, the fs is pinned via other methods.  If not,
+        * we're done with it anyway.  So release_fs() is always right.
+        */
+       configfs_release_fs();
+
+       return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item);
+
+/*
+ * Release the dependent linkage.  This is much simpler than
+ * configfs_depend_item() because we know that the client driver is
+ * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
+ */
+void configfs_undepend_item(struct configfs_subsystem *subsys,
+                           struct config_item *target)
+{
+       struct configfs_dirent *sd;
+
+       /*
+        * Since we can trust everything is pinned, we just need i_mutex
+        * on the item.
+        */
+       mutex_lock(&target->ci_dentry->d_inode->i_mutex);
+
+       sd = target->ci_dentry->d_fsdata;
+       BUG_ON(sd->s_dependent_count < 1);
+
+       sd->s_dependent_count -= 1;
+
+       /*
+        * After this unlock, we cannot trust the item to stay alive!
+        * DO NOT REFERENCE item after this unlock.
+        */
+       mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
+}
+EXPORT_SYMBOL(configfs_undepend_item);
 
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
        snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
 
-       down(&subsys->su_sem);
+       mutex_lock(&subsys->su_mutex);
        group = NULL;
        item = NULL;
        if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                if (item)
                        link_obj(parent_item, item);
        }
-       up(&subsys->su_sem);
+       mutex_unlock(&subsys->su_mutex);
 
        kfree(name);
        if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_unlink:
        if (ret) {
                /* Tear down everything we built up */
-               down(&subsys->su_sem);
+               mutex_lock(&subsys->su_mutex);
+
+               client_disconnect_notify(parent_item, item);
                if (group)
                        unlink_group(group);
                else
                        unlink_obj(item);
                client_drop_item(parent_item, item);
-               up(&subsys->su_sem);
+
+               mutex_unlock(&subsys->su_mutex);
 
                if (module_got)
                        module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (sd->s_type & CONFIGFS_USET_DEFAULT)
                return -EPERM;
 
+       /*
+        * Here's where we check for dependents.  We're protected by
+        * i_mutex.
+        */
+       if (sd->s_dependent_count)
+               return -EBUSY;
+
        /* Get a working ref until we have the child */
        parent_item = configfs_get_config_item(dentry->d_parent);
        subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (sd->s_type & CONFIGFS_USET_DIR) {
                configfs_detach_group(item);
 
-               down(&subsys->su_sem);
+               mutex_lock(&subsys->su_mutex);
+               client_disconnect_notify(parent_item, item);
                unlink_group(to_config_group(item));
        } else {
                configfs_detach_item(item);
 
-               down(&subsys->su_sem);
+               mutex_lock(&subsys->su_mutex);
+               client_disconnect_notify(parent_item, item);
                unlink_obj(item);
        }
 
        client_drop_item(parent_item, item);
-       up(&subsys->su_sem);
+       mutex_unlock(&subsys->su_mutex);
 
        /* Drop our reference from above */
        config_item_put(item);
index 3527c7c6def898cb39d1118738b8d7a89475edc7..a3658f9a082c691cc11c22de9177e253edbc5cf9 100644 (file)
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
 
+/*
+ * A simple attribute can only be 4096 characters.  Why 4k?  Because the
+ * original code limited it to PAGE_SIZE.  That's a bad idea, though,
+ * because an attribute of 16k on ia64 won't work on x86.  So we limit to
+ * 4k, our minimum common page size.
+ */
+#define SIMPLE_ATTR_SIZE 4096
 
 struct configfs_buffer {
        size_t                  count;
        loff_t                  pos;
        char                    * page;
        struct configfs_item_operations * ops;
-       struct semaphore        sem;
+       struct mutex            mutex;
        int                     needs_read_fill;
 };
 
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 
        count = ops->show_attribute(item,attr,buffer->page);
        buffer->needs_read_fill = 0;
-       BUG_ON(count > (ssize_t)PAGE_SIZE);
+       BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
        if (count >= 0)
                buffer->count = count;
        else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
        struct configfs_buffer * buffer = file->private_data;
        ssize_t retval = 0;
 
-       down(&buffer->sem);
+       mutex_lock(&buffer->mutex);
        if (buffer->needs_read_fill) {
                if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
                        goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
        retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
                                         buffer->count);
 out:
-       up(&buffer->sem);
+       mutex_unlock(&buffer->mutex);
        return retval;
 }
 
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
        if (!buffer->page)
                return -ENOMEM;
 
-       if (count >= PAGE_SIZE)
-               count = PAGE_SIZE - 1;
+       if (count >= SIMPLE_ATTR_SIZE)
+               count = SIMPLE_ATTR_SIZE - 1;
        error = copy_from_user(buffer->page,buf,count);
        buffer->needs_read_fill = 1;
        /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
        struct configfs_buffer * buffer = file->private_data;
        ssize_t len;
 
-       down(&buffer->sem);
+       mutex_lock(&buffer->mutex);
        len = fill_write_buffer(buffer, buf, count);
        if (len > 0)
                len = flush_write_buffer(file->f_path.dentry, buffer, count);
        if (len > 0)
                *ppos += len;
-       up(&buffer->sem);
+       mutex_unlock(&buffer->mutex);
        return len;
 }
 
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
                error = -ENOMEM;
                goto Enomem;
        }
-       init_MUTEX(&buffer->sem);
+       mutex_init(&buffer->mutex);
        buffer->needs_read_fill = 1;
        buffer->ops = ops;
        file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
        if (buffer) {
                if (buffer->page)
                        free_page((unsigned long)buffer->page);
+               mutex_destroy(&buffer->mutex);
                kfree(buffer);
        }
        return 0;
index 24421209f8544607072981c933ed5c6eb7425647..76dc4c3e5d5180079aa7d35d2c368f77fcabdfd5 100644 (file)
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
  *     dynamically allocated string that @item->ci_name points to.
  *     Otherwise, use the static @item->ci_namebuf array.
  */
-
 int config_item_set_name(struct config_item * item, const char * fmt, ...)
 {
        int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
        return item;
 }
 
-/**
- *     config_item_cleanup - free config_item resources.
- *     @item:  item.
- */
-
-void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item * item)
 {
        struct config_item_type * t = item->ci_type;
        struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
                kref_put(&item->ci_kref, config_item_release);
 }
 
-
 /**
  *     config_group_init - initialize a group for use
  *     @k:     group
  */
-
 void config_group_init(struct config_group *group)
 {
        config_item_init(&group->cg_item);
        INIT_LIST_HEAD(&group->cg_children);
 }
 
-
 /**
- *     config_group_find_obj - search for item in group.
+ *     config_group_find_item - search for item in group.
  *     @group: group we're looking in.
  *     @name:  item's name.
  *
- *     Lock group via @group->cg_subsys, and iterate over @group->cg_list,
- *     looking for a matching config_item. If matching item is found
- *     take a reference and return the item.
+ *     Iterate over @group->cg_list, looking for a matching config_item.
+ *     If matching item is found take a reference and return the item.
+ *     Caller must have locked group via @group->cg_subsys->su_mutex.
  */
-
-struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+struct config_item *config_group_find_item(struct config_group *group,
+                                          const char *name)
 {
        struct list_head * entry;
        struct config_item * ret = NULL;
 
-        /* XXX LOCKING! */
        list_for_each(entry,&group->cg_children) {
                struct config_item * item = to_item(entry);
                if (config_item_name(item) &&
-                    !strcmp(config_item_name(item), name)) {
+                   !strcmp(config_item_name(item), name)) {
                        ret = config_item_get(item);
                        break;
                }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
        return ret;
 }
 
-
 EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-EXPORT_SYMBOL(config_group_find_obj);
+EXPORT_SYMBOL(config_group_find_item);
index 5069b2cb5a1f6e3fe109d148e3a7e8cd8b0930c7..2f8e3c81bc19c7b57646f3beb6d2b9469842135a 100644 (file)
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
        return len;
 }
 
-#define __CONFIGFS_ATTR(_name,_mode,_read,_write) {                           \
-       .attr   = { .ca_name = __stringify(_name),                            \
-                   .ca_mode = _mode,                                         \
-                   .ca_owner = THIS_MODULE },                                \
-       .show   = _read,                                                      \
-       .store  = _write,                                                     \
-}
-
 #define CLUSTER_ATTR(name, check_zero)                                        \
 static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
 {                                                                             \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
 int dlm_config_init(void)
 {
        config_group_init(&clusters_root.subsys.su_group);
-       init_MUTEX(&clusters_root.subsys.su_sem);
+       mutex_init(&clusters_root.subsys.su_mutex);
        return configfs_register_subsystem(&clusters_root.subsys);
 }
 
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
        if (!space_list)
                return NULL;
 
-       down(&space_list->cg_subsys->su_sem);
-       i = config_group_find_obj(space_list, name);
-       up(&space_list->cg_subsys->su_sem);
+       mutex_lock(&space_list->cg_subsys->su_mutex);
+       i = config_group_find_item(space_list, name);
+       mutex_unlock(&space_list->cg_subsys->su_mutex);
 
        return to_space(i);
 }
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
        if (!comm_list)
                return NULL;
 
-       down(&clusters_root.subsys.su_sem);
+       mutex_lock(&clusters_root.subsys.su_mutex);
 
        list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
                cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
                        break;
                }
        }
-       up(&clusters_root.subsys.su_sem);
+       mutex_unlock(&clusters_root.subsys.su_mutex);
 
        if (!found)
                cm = NULL;
index 19712a7d145feeeded8e6b21bfcd9ec7ac2a2349..f5e11f4fa952a424a5259d766a03c1d449c8ec31 100644 (file)
@@ -50,6 +50,8 @@
 #include "buffer_head_io.h"
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                        struct ocfs2_extent_block *eb);
 
 /*
  * Structures which describe a path through a btree, and functions to
@@ -116,6 +118,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
        }
 }
 
+/*
+ * Copy all the elements of src into dest. After this call, src could be
+ * freed without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+       int i;
+
+       BUG_ON(path_root_bh(dest) != path_root_bh(src));
+       BUG_ON(path_root_el(dest) != path_root_el(src));
+
+       ocfs2_reinit_path(dest, 1);
+
+       for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+               dest->p_node[i].bh = src->p_node[i].bh;
+               dest->p_node[i].el = src->p_node[i].el;
+
+               if (dest->p_node[i].bh)
+                       get_bh(dest->p_node[i].bh);
+       }
+}
+
 /*
  * Make the *dest path the same as src and re-initialize src path to
  * have a root only.
@@ -212,10 +239,41 @@ out:
        return ret;
 }
 
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+       int ret = -1;
+       int i;
+       struct ocfs2_extent_rec *rec;
+       u32 rec_end, rec_start, clusters;
+
+       for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+               rec = &el->l_recs[i];
+
+               rec_start = le32_to_cpu(rec->e_cpos);
+               clusters = ocfs2_rec_clusters(el, rec);
+
+               rec_end = rec_start + clusters;
+
+               if (v_cluster >= rec_start && v_cluster < rec_end) {
+                       ret = i;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 enum ocfs2_contig_type {
        CONTIG_NONE = 0,
        CONTIG_LEFT,
-       CONTIG_RIGHT
+       CONTIG_RIGHT,
+       CONTIG_LEFTRIGHT,
 };
 
 
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
 {
        u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 
+       /*
+        * Refuse to coalesce extent records with different flag
+        * fields - we don't want to mix unwritten extents with user
+        * data.
+        */
+       if (ext->e_flags != insert_rec->e_flags)
+               return CONTIG_NONE;
+
        if (ocfs2_extents_adjacent(ext, insert_rec) &&
            ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
                        return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
        APPEND_TAIL,
 };
 
+enum ocfs2_split_type {
+       SPLIT_NONE = 0,
+       SPLIT_LEFT,
+       SPLIT_RIGHT,
+};
+
 struct ocfs2_insert_type {
+       enum ocfs2_split_type   ins_split;
        enum ocfs2_append_type  ins_appending;
        enum ocfs2_contig_type  ins_contig;
        int                     ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
        int                     ins_tree_depth;
 };
 
+struct ocfs2_merge_ctxt {
+       enum ocfs2_contig_type  c_contig_type;
+       int                     c_has_empty_extent;
+       int                     c_split_covers_rec;
+       int                     c_used_tail_recs;
+};
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
                        strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
                        eb->h_blkno = cpu_to_le64(first_blkno);
                        eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
-                       /* we always use slot zero's suballocator */
-                       eb->h_suballoc_slot = 0;
-#else
                        eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
-#endif
                        eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
                        eb->h_list.l_count =
                                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                            struct inode *inode,
                            struct buffer_head *fe_bh,
                            struct buffer_head *eb_bh,
-                           struct buffer_head *last_eb_bh,
+                           struct buffer_head **last_eb_bh,
                            struct ocfs2_alloc_context *meta_ac)
 {
        int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
        mlog_entry_void();
 
-       BUG_ON(!last_eb_bh);
+       BUG_ON(!last_eb_bh || !*last_eb_bh);
 
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                goto bail;
        }
 
-       eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+       eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
 
        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
-       status = ocfs2_journal_access(handle, inode, last_eb_bh,
+       status = ocfs2_journal_access(handle, inode, *last_eb_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * next_leaf on the previously last-extent-block. */
        fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
 
-       eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+       eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
-       status = ocfs2_journal_dirty(handle, last_eb_bh);
+       status = ocfs2_journal_dirty(handle, *last_eb_bh);
        if (status < 0)
                mlog_errno(status);
        status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                        mlog_errno(status);
        }
 
+       /*
+        * Some callers want to track the rightmost leaf so pass it
+        * back here.
+        */
+       brelse(*last_eb_bh);
+       get_bh(new_eb_bhs[0]);
+       *last_eb_bh = new_eb_bhs[0];
+
        status = 0;
 bail:
        if (new_eb_bhs) {
@@ -828,6 +910,87 @@ bail:
        return status;
 }
 
+/*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
+                          struct buffer_head *di_bh, int *final_depth,
+                          struct buffer_head **last_eb_bh,
+                          struct ocfs2_alloc_context *meta_ac)
+{
+       int ret, shift;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct buffer_head *bh = NULL;
+
+       BUG_ON(meta_ac == NULL);
+
+       shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+       if (shift < 0) {
+               ret = shift;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /* We traveled all the way to the bottom of the allocation tree
+        * and didn't find room for any more extents - we need to add
+        * another tree level */
+       if (shift) {
+               BUG_ON(bh);
+               mlog(0, "need to shift tree depth (current = %d)\n", depth);
+
+               /* ocfs2_shift_tree_depth will return us a buffer with
+                * the new extent block (so we can pass that to
+                * ocfs2_add_branch). */
+               ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+                                            meta_ac, &bh);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               depth++;
+               if (depth == 1) {
+                       /*
+                        * Special case: we have room now if we shifted from
+                        * tree_depth 0, so no more work needs to be done.
+                        *
+                        * We won't be calling add_branch, so pass
+                        * back *last_eb_bh as the new leaf. At depth
+                        * zero, it should always be null so there's
+                        * no reason to brelse.
+                        */
+                       BUG_ON(*last_eb_bh);
+                       get_bh(bh);
+                       *last_eb_bh = bh;
+                       goto out;
+               }
+       }
+
+       /* call ocfs2_add_branch to add the final part of the tree with
+        * the new data. */
+       mlog(0, "add branch. bh = %p\n", bh);
+       ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+                              meta_ac);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+out:
+       if (final_depth)
+               *final_depth = depth;
+       brelse(bh);
+       return ret;
+}
+
 /*
  * This is only valid for leaf nodes, which are the only ones that can
  * have empty extents anyway.
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
 
 }
 
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+       int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+       BUG_ON(num_recs == 0);
+
+       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+               num_recs--;
+               size = num_recs * sizeof(struct ocfs2_extent_rec);
+               memmove(&el->l_recs[0], &el->l_recs[1], size);
+               memset(&el->l_recs[num_recs], 0,
+                      sizeof(struct ocfs2_extent_rec));
+               el->l_next_free_rec = cpu_to_le16(num_recs);
+       }
+}
+
 /*
  * Create an empty extent record .
  *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
         * immediately to their right.
         */
        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+       if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+               BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+               left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+       }
        left_clusters -= le32_to_cpu(left_rec->e_cpos);
        left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
@@ -1531,10 +1714,16 @@ out:
        return ret;
 }
 
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+                                          int op_credits,
                                           struct ocfs2_path *path)
 {
-       int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+       int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
        if (handle->h_buffer_credits < credits)
                return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
        return 0;
 }
 
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+       int next_free = le16_to_cpu(el->l_next_free_rec);
+       unsigned int range;
+       struct ocfs2_extent_rec *rec;
+
+       if (next_free == 0)
+               return 0;
+
+       rec = &el->l_recs[0];
+       if (ocfs2_is_empty_extent(rec)) {
+               /* Empty list. */
+               if (next_free == 1)
+                       return 0;
+               rec = &el->l_recs[1];
+       }
+
+       range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+       if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+               return 1;
+       return 0;
+}
+
 /*
  * Rotate all the records in a btree right one record, starting at insert_cpos.
  *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
  */
 static int ocfs2_rotate_tree_right(struct inode *inode,
                                   handle_t *handle,
+                                  enum ocfs2_split_type split,
                                   u32 insert_cpos,
                                   struct ocfs2_path *right_path,
                                   struct ocfs2_path **ret_left_path)
 {
-       int ret, start;
+       int ret, start, orig_credits = handle->h_buffer_credits;
        u32 cpos;
        struct ocfs2_path *left_path = NULL;
 
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
                                (unsigned long long)
                                path_leaf_bh(left_path)->b_blocknr);
 
-               if (ocfs2_rotate_requires_path_adjustment(left_path,
+               if (split == SPLIT_NONE &&
+                   ocfs2_rotate_requires_path_adjustment(left_path,
                                                          insert_cpos)) {
-                       mlog(0, "Path adjustment required\n");
 
                        /*
                         * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
                     right_path->p_tree_depth);
 
                ret = ocfs2_extend_rotate_transaction(handle, start,
-                                                     right_path);
+                                                     orig_credits, right_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
                        goto out;
                }
 
+               if (split != SPLIT_NONE &&
+                   ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+                                               insert_cpos)) {
+                       /*
+                        * A rotate moves the rightmost left leaf
+                        * record over to the leftmost right leaf
+                        * slot. If we're doing an extent split
+                        * instead of a real insert, then we have to
+                        * check that the extent to be split wasn't
+                        * just moved over. If it was, then we can
+                        * exit here, passing left_path back -
+                        * ocfs2_split_extent() is smart enough to
+                        * search both leaves.
+                        */
+                       *ret_left_path = left_path;
+                       goto out_ret_path;
+               }
+
                /*
                 * There is no need to re-read the next right path
                 * as we know that it'll be our current left
@@ -1722,557 +1953,1725 @@ out_ret_path:
        return ret;
 }
 
-/*
- * Do the final bits of extent record insertion at the target leaf
- * list. If this leaf is part of an allocation tree, it is assumed
- * that the tree above has been prepared.
- */
-static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
-                                struct ocfs2_extent_list *el,
-                                struct ocfs2_insert_type *insert,
-                                struct inode *inode)
+static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+                                     struct ocfs2_path *path)
 {
-       int i = insert->ins_contig_index;
-       unsigned int range;
+       int i, idx;
        struct ocfs2_extent_rec *rec;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_block *eb;
+       u32 range;
 
-       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+       /* Path should always be rightmost. */
+       eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+       BUG_ON(eb->h_next_leaf_blk != 0ULL);
 
-       /*
-        * Contiguous insert - either left or right.
-        */
-       if (insert->ins_contig != CONTIG_NONE) {
-               rec = &el->l_recs[i];
-               if (insert->ins_contig == CONTIG_LEFT) {
-                       rec->e_blkno = insert_rec->e_blkno;
-                       rec->e_cpos = insert_rec->e_cpos;
-               }
-               le16_add_cpu(&rec->e_leaf_clusters,
-                            le16_to_cpu(insert_rec->e_leaf_clusters));
-               return;
-       }
+       el = &eb->h_list;
+       BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+       idx = le16_to_cpu(el->l_next_free_rec) - 1;
+       rec = &el->l_recs[idx];
+       range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
 
-       /*
-        * Handle insert into an empty leaf.
-        */
-       if (le16_to_cpu(el->l_next_free_rec) == 0 ||
-           ((le16_to_cpu(el->l_next_free_rec) == 1) &&
-            ocfs2_is_empty_extent(&el->l_recs[0]))) {
-               el->l_recs[0] = *insert_rec;
-               el->l_next_free_rec = cpu_to_le16(1);
-               return;
-       }
+       for (i = 0; i < path->p_tree_depth; i++) {
+               el = path->p_node[i].el;
+               idx = le16_to_cpu(el->l_next_free_rec) - 1;
+               rec = &el->l_recs[idx];
 
-       /*
-        * Appending insert.
-        */
-       if (insert->ins_appending == APPEND_TAIL) {
-               i = le16_to_cpu(el->l_next_free_rec) - 1;
-               rec = &el->l_recs[i];
-               range = le32_to_cpu(rec->e_cpos)
-                       + le16_to_cpu(rec->e_leaf_clusters);
-               BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+               rec->e_int_clusters = cpu_to_le32(range);
+               le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
 
-               mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
-                               le16_to_cpu(el->l_count),
-                               "inode %lu, depth %u, count %u, next free %u, "
-                               "rec.cpos %u, rec.clusters %u, "
-                               "insert.cpos %u, insert.clusters %u\n",
-                               inode->i_ino,
-                               le16_to_cpu(el->l_tree_depth),
-                               le16_to_cpu(el->l_count),
-                               le16_to_cpu(el->l_next_free_rec),
-                               le32_to_cpu(el->l_recs[i].e_cpos),
-                               le16_to_cpu(el->l_recs[i].e_leaf_clusters),
-                               le32_to_cpu(insert_rec->e_cpos),
-                               le16_to_cpu(insert_rec->e_leaf_clusters));
-               i++;
-               el->l_recs[i] = *insert_rec;
-               le16_add_cpu(&el->l_next_free_rec, 1);
-               return;
+               ocfs2_journal_dirty(handle, path->p_node[i].bh);
        }
-
-       /*
-        * Ok, we have to rotate.
-        *
-        * At this point, it is safe to assume that inserting into an
-        * empty leaf and appending to a leaf have both been handled
-        * above.
-        *
-        * This leaf needs to have space, either by the empty 1st
-        * extent record, or by virtue of an l_next_rec < l_count.
-        */
-       ocfs2_rotate_leaf(el, insert_rec);
-}
-
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
-                                               struct ocfs2_dinode *di,
-                                               u32 clusters)
-{
-       le32_add_cpu(&di->i_clusters, clusters);
-       spin_lock(&OCFS2_I(inode)->ip_lock);
-       OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
-       spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
 
-static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
-                                   struct ocfs2_extent_rec *insert_rec,
-                                   struct ocfs2_path *right_path,
-                                   struct ocfs2_path **ret_left_path)
+static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc,
+                             struct ocfs2_path *path, int unlink_start)
 {
-       int ret, i, next_free;
-       struct buffer_head *bh;
+       int ret, i;
+       struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
-       struct ocfs2_path *left_path = NULL;
+       struct buffer_head *bh;
 
-       *ret_left_path = NULL;
+       for(i = unlink_start; i < path_num_items(path); i++) {
+               bh = path->p_node[i].bh;
 
-       /*
-        * This shouldn't happen for non-trees. The extent rec cluster
-        * count manipulation below only works for interior nodes.
-        */
-       BUG_ON(right_path->p_tree_depth == 0);
+               eb = (struct ocfs2_extent_block *)bh->b_data;
+               /*
+                * Not all nodes might have had their final count
+                * decremented by the caller - handle this here.
+                */
+               el = &eb->h_list;
+               if (le16_to_cpu(el->l_next_free_rec) > 1) {
+                       mlog(ML_ERROR,
+                            "Inode %llu, attempted to remove extent block "
+                            "%llu with %u records\n",
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)le64_to_cpu(eb->h_blkno),
+                            le16_to_cpu(el->l_next_free_rec));
+
+                       ocfs2_journal_dirty(handle, bh);
+                       ocfs2_remove_from_cache(inode, bh);
+                       continue;
+               }
 
-       /*
-        * If our appending insert is at the leftmost edge of a leaf,
-        * then we might need to update the rightmost records of the
-        * neighboring path.
-        */
-       el = path_leaf_el(right_path);
-       next_free = le16_to_cpu(el->l_next_free_rec);
-       if (next_free == 0 ||
-           (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
-               u32 left_cpos;
+               el->l_next_free_rec = 0;
+               memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-               ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
-                                                   &left_cpos);
-               if (ret) {
+               ocfs2_journal_dirty(handle, bh);
+
+               ret = ocfs2_cache_extent_block_free(dealloc, eb);
+               if (ret)
                        mlog_errno(ret);
-                       goto out;
-               }
 
-               mlog(0, "Append may need a left path update. cpos: %u, "
-                    "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
-                    left_cpos);
+               ocfs2_remove_from_cache(inode, bh);
+       }
+}
 
-               /*
-                * No need to worry if the append is already in the
-                * leftmost leaf.
+static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+                                struct ocfs2_path *left_path,
+                                struct ocfs2_path *right_path,
+                                int subtree_index,
+                                struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int i;
+       struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+       struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_block *eb;
+
+       el = path_leaf_el(left_path);
+
+       eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
+
+       for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+               if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+                       break;
+
+       BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
+
+       memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+       le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+       eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+       eb->h_next_leaf_blk = 0;
+
+       ocfs2_journal_dirty(handle, root_bh);
+       ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+       ocfs2_unlink_path(inode, handle, dealloc, right_path,
+                         subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+                                    struct ocfs2_path *left_path,
+                                    struct ocfs2_path *right_path,
+                                    int subtree_index,
+                                    struct ocfs2_cached_dealloc_ctxt *dealloc,
+                                    int *deleted)
+{
+       int ret, i, del_right_subtree = 0, right_has_empty = 0;
+       struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+       struct ocfs2_extent_block *eb;
+
+       *deleted = 0;
+
+       right_leaf_el = path_leaf_el(right_path);
+       left_leaf_el = path_leaf_el(left_path);
+       root_bh = left_path->p_node[subtree_index].bh;
+       BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+       if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+               return 0;
+
+       eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+       if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
+               /*
+                * It's legal for us to proceed if the right leaf is
+                * the rightmost one and it has an empty extent. There
+                * are two cases to handle - whether the leaf will be
+                * empty after removal or not. If the leaf isn't empty
+                * then just remove the empty extent up front. The
+                * next block will handle empty leaves by flagging
+                * them for unlink.
+                *
+                * Non rightmost leaves will throw -EAGAIN and the
+                * caller can manually move the subtree and retry.
                 */
-               if (left_cpos) {
-                       left_path = ocfs2_new_path(path_root_bh(right_path),
-                                                  path_root_el(right_path));
-                       if (!left_path) {
-                               ret = -ENOMEM;
-                               mlog_errno(ret);
-                               goto out;
-                       }
 
-                       ret = ocfs2_find_path(inode, left_path, left_cpos);
+               if (eb->h_next_leaf_blk != 0ULL)
+                       return -EAGAIN;
+
+               if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+                       ret = ocfs2_journal_access(handle, inode,
+                                                  path_leaf_bh(right_path),
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
 
-                       /*
-                        * ocfs2_insert_path() will pass the left_path to the
-                        * journal for us.
-                        */
+                       ocfs2_remove_empty_extent(right_leaf_el);
+               } else
+                       right_has_empty = 1;
+       }
+
+       if (eb->h_next_leaf_blk == 0ULL &&
+           le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+               /*
+                * We have to update i_last_eb_blk during the meta
+                * data delete.
+                */
+               ret = ocfs2_journal_access(handle, inode, di_bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
                }
+
+               del_right_subtree = 1;
        }
 
-       ret = ocfs2_journal_access_path(inode, handle, right_path);
+       /*
+        * Getting here with an empty extent in the right path implies
+        * that it's the rightmost path and will be deleted.
+        */
+       BUG_ON(right_has_empty && !del_right_subtree);
+
+       ret = ocfs2_journal_access(handle, inode, root_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       el = path_root_el(right_path);
-       bh = path_root_bh(right_path);
-       i = 0;
-       while (1) {
-               struct ocfs2_extent_rec *rec;
+       for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+               ret = ocfs2_journal_access(handle, inode,
+                                          right_path->p_node[i].bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-               next_free = le16_to_cpu(el->l_next_free_rec);
-               if (next_free == 0) {
-                       ocfs2_error(inode->i_sb,
-                                   "Dinode %llu has a bad extent list",
-                                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                       ret = -EIO;
+               ret = ocfs2_journal_access(handle, inode,
+                                          left_path->p_node[i].bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
                        goto out;
                }
+       }
 
-               rec = &el->l_recs[next_free - 1];
+       if (!right_has_empty) {
+               /*
+                * Only do this if we're moving a real
+                * record. Otherwise, the action is delayed until
+                * after removal of the right path in which case we
+                * can do a simple shift to remove the empty extent.
+                */
+               ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+               memset(&right_leaf_el->l_recs[0], 0,
+                      sizeof(struct ocfs2_extent_rec));
+       }
+       if (eb->h_next_leaf_blk == 0ULL) {
+               /*
+                * Move recs over to get rid of empty extent, decrease
+                * next_free. This is allowed to remove the last
+                * extent in our leaf (setting l_next_free_rec to
+                * zero) - the delete code below won't care.
+                */
+               ocfs2_remove_empty_extent(right_leaf_el);
+       }
 
-               rec->e_int_clusters = insert_rec->e_cpos;
-               le32_add_cpu(&rec->e_int_clusters,
-                            le16_to_cpu(insert_rec->e_leaf_clusters));
-               le32_add_cpu(&rec->e_int_clusters,
-                            -le32_to_cpu(rec->e_cpos));
+       ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+       if (ret)
+               mlog_errno(ret);
+       ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+       if (ret)
+               mlog_errno(ret);
 
-               ret = ocfs2_journal_dirty(handle, bh);
+       if (del_right_subtree) {
+               ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+                                    subtree_index, dealloc);
+               ocfs2_update_edge_lengths(inode, handle, left_path);
+
+               eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+               di->i_last_eb_blk = eb->h_blkno;
+
+               /*
+                * Removal of the extent in the left leaf was skipped
+                * above so we could delete the right path
+                * 1st.
+                */
+               if (right_has_empty)
+                       ocfs2_remove_empty_extent(left_leaf_el);
+
+               ret = ocfs2_journal_dirty(handle, di_bh);
                if (ret)
                        mlog_errno(ret);
 
-               /* Don't touch the leaf node */
-               if (++i >= right_path->p_tree_depth)
-                       break;
-
-               bh = right_path->p_node[i].bh;
-               el = right_path->p_node[i].el;
-       }
+               *deleted = 1;
+       } else
+               ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+                                          subtree_index);
 
-       *ret_left_path = left_path;
-       ret = 0;
 out:
-       if (ret != 0)
-               ocfs2_free_path(left_path);
-
        return ret;
 }
 
 /*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
  *
- * right_path is the path we want to do the actual insert
- * in. left_path should only be passed in if we need to update that
- * portion of the tree after an edge insert.
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
  */
-static int ocfs2_insert_path(struct inode *inode,
-                            handle_t *handle,
-                            struct ocfs2_path *left_path,
-                            struct ocfs2_path *right_path,
-                            struct ocfs2_extent_rec *insert_rec,
-                            struct ocfs2_insert_type *insert)
+static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+                                         struct ocfs2_path *path, u32 *cpos)
 {
-       int ret, subtree_index;
-       struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+       int i, j, ret = 0;
+       u64 blkno;
        struct ocfs2_extent_list *el;
 
-       /*
-        * Pass both paths to the journal. The majority of inserts
-        * will be touching all components anyway.
-        */
-       ret = ocfs2_journal_access_path(inode, handle, right_path);
-       if (ret < 0) {
-               mlog_errno(ret);
-               goto out;
-       }
+       *cpos = 0;
 
-       if (left_path) {
-               int credits = handle->h_buffer_credits;
+       if (path->p_tree_depth == 0)
+               return 0;
+
+       blkno = path_leaf_bh(path)->b_blocknr;
+
+       /* Start at the tree node just above the leaf and work our way up. */
+       i = path->p_tree_depth - 1;
+       while (i >= 0) {
+               int next_free;
+
+               el = path->p_node[i].el;
 
                /*
-                * There's a chance that left_path got passed back to
-                * us without being accounted for in the
-                * journal. Extend our transaction here to be sure we
-                * can change those blocks.
+                * Find the extent record just after the one in our
+                * path.
                 */
-               credits += left_path->p_tree_depth;
+               next_free = le16_to_cpu(el->l_next_free_rec);
+               for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+                       if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+                               if (j == (next_free - 1)) {
+                                       if (i == 0) {
+                                               /*
+                                                * We've determined that the
+                                                * path specified is already
+                                                * the rightmost one - return a
+                                                * cpos of zero.
+                                                */
+                                               goto out;
+                                       }
+                                       /*
+                                        * The rightmost record points to our
+                                        * leaf - we need to travel up the
+                                        * tree one level.
+                                        */
+                                       goto next_node;
+                               }
 
-               ret = ocfs2_extend_trans(handle, credits);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
+                               *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+                               goto out;
+                       }
                }
 
-               ret = ocfs2_journal_access_path(inode, handle, left_path);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
-               }
+               /*
+                * If we got here, we never found a valid node where
+                * the tree indicated one should be.
+                */
+               ocfs2_error(sb,
+                           "Invalid extent tree at extent block %llu\n",
+                           (unsigned long long)blkno);
+               ret = -EROFS;
+               goto out;
+
+next_node:
+               blkno = path->p_node[i].bh->b_blocknr;
+               i--;
        }
 
-       el = path_leaf_el(right_path);
+out:
+       return ret;
+}
 
-       ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
-       ret = ocfs2_journal_dirty(handle, leaf_bh);
-       if (ret)
-               mlog_errno(ret);
+static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
+                                           handle_t *handle,
+                                           struct buffer_head *bh,
+                                           struct ocfs2_extent_list *el)
+{
+       int ret;
 
-       if (left_path) {
-               /*
-                * The rotate code has indicated that we need to fix
-                * up portions of the tree after the insert.
-                *
-                * XXX: Should we extend the transaction here?
-                */
-               subtree_index = ocfs2_find_subtree_root(inode, left_path,
-                                                       right_path);
-               ocfs2_complete_edge_insert(inode, handle, left_path,
-                                          right_path, subtree_index);
+       if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+               return 0;
+
+       ret = ocfs2_journal_access(handle, inode, bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
        }
 
-       ret = 0;
+       ocfs2_remove_empty_extent(el);
+
+       ret = ocfs2_journal_dirty(handle, bh);
+       if (ret)
+               mlog_errno(ret);
+
 out:
        return ret;
 }
 
-static int ocfs2_do_insert_extent(struct inode *inode,
-                                 handle_t *handle,
-                                 struct buffer_head *di_bh,
-                                 struct ocfs2_extent_rec *insert_rec,
-                                 struct ocfs2_insert_type *type)
+static int __ocfs2_rotate_tree_left(struct inode *inode,
+                                   handle_t *handle, int orig_credits,
+                                   struct ocfs2_path *path,
+                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
+                                   struct ocfs2_path **empty_extent_path)
 {
-       int ret, rotate = 0;
-       u32 cpos;
-       struct ocfs2_path *right_path = NULL;
+       int ret, subtree_root, deleted;
+       u32 right_cpos;
        struct ocfs2_path *left_path = NULL;
-       struct ocfs2_dinode *di;
-       struct ocfs2_extent_list *el;
+       struct ocfs2_path *right_path = NULL;
 
-       di = (struct ocfs2_dinode *) di_bh->b_data;
-       el = &di->id2.i_list;
+       BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
 
-       ret = ocfs2_journal_access(handle, inode, di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       *empty_extent_path = NULL;
+
+       ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
+                                            &right_cpos);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       if (le16_to_cpu(el->l_tree_depth) == 0) {
-               ocfs2_insert_at_leaf(insert_rec, el, type, inode);
-               goto out_update_clusters;
+       left_path = ocfs2_new_path(path_root_bh(path),
+                                  path_root_el(path));
+       if (!left_path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
        }
 
-       right_path = ocfs2_new_inode_path(di_bh);
+       ocfs2_cp_path(left_path, path);
+
+       right_path = ocfs2_new_path(path_root_bh(path),
+                                   path_root_el(path));
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }
 
-       /*
-        * Determine the path to start with. Rotations need the
-        * rightmost path, everything else can go directly to the
-        * target leaf.
-        */
-       cpos = le32_to_cpu(insert_rec->e_cpos);
-       if (type->ins_appending == APPEND_NONE &&
-           type->ins_contig == CONTIG_NONE) {
-               rotate = 1;
-               cpos = UINT_MAX;
-       }
+       while (right_cpos) {
+               ret = ocfs2_find_path(inode, right_path, right_cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-       ret = ocfs2_find_path(inode, right_path, cpos);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+               subtree_root = ocfs2_find_subtree_root(inode, left_path,
+                                                      right_path);
 
-       /*
-        * Rotations and appends need special treatment - they modify
-        * parts of the tree's above them.
-        *
-        * Both might pass back a path immediate to the left of the
-        * one being inserted to. This will be cause
-        * ocfs2_insert_path() to modify the rightmost records of
-        * left_path to account for an edge insert.
-        *
-        * XXX: When modifying this code, keep in mind that an insert
-        * can wind up skipping both of these two special cases...
-        */
-       if (rotate) {
-               ret = ocfs2_rotate_tree_right(inode, handle,
-                                             le32_to_cpu(insert_rec->e_cpos),
-                                             right_path, &left_path);
+               mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+                    subtree_root,
+                    (unsigned long long)
+                    right_path->p_node[subtree_root].bh->b_blocknr,
+                    right_path->p_tree_depth);
+
+               ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+                                                     orig_credits, left_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-       } else if (type->ins_appending == APPEND_TAIL
-                  && type->ins_contig != CONTIG_LEFT) {
-               ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
-                                              right_path, &left_path);
+
+               ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+                                               right_path, subtree_root,
+                                               dealloc, &deleted);
+               if (ret == -EAGAIN) {
+                       /*
+                        * The rotation has to temporarily stop due to
+                        * the right subtree having an empty
+                        * extent. Pass it back to the caller for a
+                        * fixup.
+                        */
+                       *empty_extent_path = right_path;
+                       right_path = NULL;
+                       goto out;
+               }
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-       }
 
-       ret = ocfs2_insert_path(inode, handle, left_path, right_path,
-                               insert_rec, type);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+               /*
+                * The subtree rotate might have removed records on
+                * the rightmost edge. If so, then rotation is
+                * complete.
+                */
+               if (deleted)
+                       break;
 
-out_update_clusters:
-       ocfs2_update_dinode_clusters(inode, di,
-                                    le16_to_cpu(insert_rec->e_leaf_clusters));
+               ocfs2_mv_path(left_path, right_path);
 
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret)
-               mlog_errno(ret);
+               ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+                                                    &right_cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
 
 out:
-       ocfs2_free_path(left_path);
        ocfs2_free_path(right_path);
+       ocfs2_free_path(left_path);
 
        return ret;
 }
 
-static void ocfs2_figure_contig_type(struct inode *inode,
-                                    struct ocfs2_insert_type *insert,
-                                    struct ocfs2_extent_list *el,
-                                    struct ocfs2_extent_rec *insert_rec)
+static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+                                      struct ocfs2_path *path,
+                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
-       int i;
-       enum ocfs2_contig_type contig_type = CONTIG_NONE;
+       int ret, subtree_index;
+       u32 cpos;
+       struct ocfs2_path *left_path = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *el;
 
-       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+       /*
+        * XXX: This code assumes that the root is an inode, which is
+        * true for now but may change as tree code gets generic.
+        */
+       di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
+       if (!OCFS2_IS_VALID_DINODE(di)) {
+               ret = -EIO;
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has invalid path root",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno);
+               goto out;
+       }
 
-       for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-               contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
-                                                 insert_rec);
-               if (contig_type != CONTIG_NONE) {
-                       insert->ins_contig_index = i;
-                       break;
-               }
+       /*
+        * There are two ways we handle this, depending on
+        * whether path is the only existing one.
+        */
+       ret = ocfs2_extend_rotate_transaction(handle, 0,
+                                             handle->h_buffer_credits,
+                                             path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
        }
-       insert->ins_contig = contig_type;
-}
 
-/*
- * This should only be called against the righmost leaf extent list.
- *
- * ocfs2_figure_appending_type() will figure out whether we'll have to
- * insert at the tail of the rightmost leaf.
- *
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
- * then the logic here makes sense.
- */
-static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
-                                       struct ocfs2_extent_list *el,
-                                       struct ocfs2_extent_rec *insert_rec)
-{
-       int i;
-       u32 cpos = le32_to_cpu(insert_rec->e_cpos);
-       struct ocfs2_extent_rec *rec;
+       ret = ocfs2_journal_access_path(inode, handle, path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       insert->ins_appending = APPEND_NONE;
+       ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+       if (cpos) {
+               /*
+                * We have a path to the left of this one - it needs
+                * an update too.
+                */
+               left_path = ocfs2_new_path(path_root_bh(path),
+                                          path_root_el(path));
+               if (!left_path) {
+                       ret = -ENOMEM;
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-       if (!el->l_next_free_rec)
-               goto set_tail_append;
+               ret = ocfs2_find_path(inode, left_path, cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-               /* Were all records empty? */
-               if (le16_to_cpu(el->l_next_free_rec) == 1)
-                       goto set_tail_append;
-       }
+               ret = ocfs2_journal_access_path(inode, handle, left_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-       i = le16_to_cpu(el->l_next_free_rec) - 1;
-       rec = &el->l_recs[i];
+               subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
 
-       if (cpos >=
-           (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
-               goto set_tail_append;
+               ocfs2_unlink_subtree(inode, handle, left_path, path,
+                                    subtree_index, dealloc);
+               ocfs2_update_edge_lengths(inode, handle, left_path);
 
-       return;
+               eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+               di->i_last_eb_blk = eb->h_blkno;
+       } else {
+               /*
+                * 'path' is also the leftmost path which
+                * means it must be the only one. This gets
+                * handled differently because we want to
+                * revert the inode back to having extents
+                * in-line.
+                */
+               ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-set_tail_append:
-       insert->ins_appending = APPEND_TAIL;
+               el = &di->id2.i_list;
+               el->l_tree_depth = 0;
+               el->l_next_free_rec = 0;
+               memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+               di->i_last_eb_blk = 0;
+       }
+
+       ocfs2_journal_dirty(handle, path_root_bh(path));
+
+out:
+       ocfs2_free_path(left_path);
+       return ret;
 }
 
 /*
- * Helper function called at the begining of an insert.
+ * Left rotation of btree records.
  *
- * This computes a few things that are commonly used in the process of
- * inserting into the btree:
- *   - Whether the new extent is contiguous with an existing one.
- *   - The current tree depth.
- *   - Whether the insert is an appending one.
- *   - The total # of free records in the tree.
+ * In many ways, this is (unsurprisingly) the opposite of right
+ * rotation. We start at some non-rightmost path containing an empty
+ * extent in the leaf block. The code works its way to the rightmost
+ * path by rotating records to the left in every subtree.
  *
- * All of the information is stored on the ocfs2_insert_type
- * structure.
+ * This is used by any code which reduces the number of extent records
+ * in a leaf. After removal, an empty record should be placed in the
+ * leftmost list position.
+ *
+ * This won't handle a length update of the rightmost path records if
+ * the rightmost tree leaf record is removed so the caller is
+ * responsible for detecting and correcting that.
  */
-static int ocfs2_figure_insert_type(struct inode *inode,
-                                   struct buffer_head *di_bh,
-                                   struct buffer_head **last_eb_bh,
-                                   struct ocfs2_extent_rec *insert_rec,
-                                   struct ocfs2_insert_type *insert)
+static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+                                 struct ocfs2_path *path,
+                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
-       int ret;
-       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       int ret, orig_credits = handle->h_buffer_credits;
+       struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
-       struct ocfs2_path *path = NULL;
-       struct buffer_head *bh = NULL;
 
-       el = &di->id2.i_list;
-       insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+       el = path_leaf_el(path);
+       if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+               return 0;
 
-       if (el->l_tree_depth) {
+       if (path->p_tree_depth == 0) {
+rightmost_no_delete:
                /*
-                * If we have tree depth, we read in the
-                * rightmost extent block ahead of time as
-                * ocfs2_figure_insert_type() and ocfs2_add_branch()
-                * may want it later.
+                * In-inode extents. This is trivially handled, so do
+                * it up front.
                 */
-               ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-                                      le64_to_cpu(di->i_last_eb_blk), &bh,
-                                      OCFS2_BH_CACHED, inode);
-               if (ret) {
-                       mlog_exit(ret);
-                       goto out;
-               }
-               eb = (struct ocfs2_extent_block *) bh->b_data;
-               el = &eb->h_list;
+               ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
+                                                      path_leaf_bh(path),
+                                                      path_leaf_el(path));
+               if (ret)
+                       mlog_errno(ret);
+               goto out;
        }
 
        /*
-        * Unless we have a contiguous insert, we'll need to know if
-        * there is room left in our allocation tree for another
-        * extent record.
+        * Handle rightmost branch now. There's several cases:
+        *  1) simple rotation leaving records in there. That's trivial.
+        *  2) rotation requiring a branch delete - there's no more
+        *     records left. Two cases of this:
+        *     a) There are branches to the left.
+        *     b) This is also the leftmost (the only) branch.
         *
-        * XXX: This test is simplistic, we can search for empty
-        * extent records too.
+        *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
+        *  2a) we need the left branch so that we can update it with the unlink
+        *  2b) we need to bring the inode back to inline extents.
         */
-       insert->ins_free_records = le16_to_cpu(el->l_count) -
-               le16_to_cpu(el->l_next_free_rec);
 
-       if (!insert->ins_tree_depth) {
-               ocfs2_figure_contig_type(inode, insert, el, insert_rec);
-               ocfs2_figure_appending_type(insert, el, insert_rec);
-               return 0;
-       }
+       eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+       el = &eb->h_list;
+       if (eb->h_next_leaf_blk == 0) {
+               /*
+                * This gets a bit tricky if we're going to delete the
+                * rightmost path. Get the other cases out of the way
+                * 1st.
+                */
+               if (le16_to_cpu(el->l_next_free_rec) > 1)
+                       goto rightmost_no_delete;
 
-       path = ocfs2_new_inode_path(di_bh);
-       if (!path) {
-               ret = -ENOMEM;
-               mlog_errno(ret);
+               if (le16_to_cpu(el->l_next_free_rec) == 0) {
+                       ret = -EIO;
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %llu has empty extent block at %llu",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                   (unsigned long long)le64_to_cpu(eb->h_blkno));
+                       goto out;
+               }
+
+               /*
+                * XXX: The caller can not trust "path" any more after
+                * this as it will have been deleted. What do we do?
+                *
+                * In theory the rotate-for-merge code will never get
+                * here because it'll always ask for a rotate in a
+                * nonempty list.
+                */
+
+               ret = ocfs2_remove_rightmost_path(inode, handle, path,
+                                                 dealloc);
+               if (ret)
+                       mlog_errno(ret);
                goto out;
        }
 
        /*
-        * In the case that we're inserting past what the tree
-        * currently accounts for, ocfs2_find_path() will return for
-        * us the rightmost tree path. This is accounted for below in
-        * the appending code.
+        * Now we can loop, remembering the path we get from -EAGAIN
+        * and restarting from there.
         */
-       ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
-       if (ret) {
+try_rotate:
+       ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
+                                      dealloc, &restart_path);
+       if (ret && ret != -EAGAIN) {
                mlog_errno(ret);
                goto out;
        }
 
-       el = path_leaf_el(path);
-
-       /*
-        * Now that we have the path, there's two things we want to determine:
-        * 1) Contiguousness (also set contig_index if this is so)
-        *
-        * 2) Are we doing an append? We can trivially break this up
-         *     into two types of appends: simple record append, or a
-         *     rotate inside the tail leaf.
-        */
-       ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+       while (ret == -EAGAIN) {
+               tmp_path = restart_path;
+               restart_path = NULL;
+
+               ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+                                              tmp_path, dealloc,
+                                              &restart_path);
+               if (ret && ret != -EAGAIN) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               ocfs2_free_path(tmp_path);
+               tmp_path = NULL;
+
+               if (ret == 0)
+                       goto try_rotate;
+       }
+
+out:
+       ocfs2_free_path(tmp_path);
+       ocfs2_free_path(restart_path);
+       return ret;
+}
+
+static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
+                               int index)
+{
+       struct ocfs2_extent_rec *rec = &el->l_recs[index];
+       unsigned int size;
+
+       if (rec->e_leaf_clusters == 0) {
+               /*
+                * We consumed all of the merged-from record. An empty
+                * extent cannot exist anywhere but the 1st array
+                * position, so move things over if the merged-from
+                * record doesn't occupy that position.
+                *
+                * This creates a new empty extent so the caller
+                * should be smart enough to have removed any existing
+                * ones.
+                */
+               if (index > 0) {
+                       BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+                       size = index * sizeof(struct ocfs2_extent_rec);
+                       memmove(&el->l_recs[1], &el->l_recs[0], size);
+               }
+
+               /*
+                * Always memset - the caller doesn't check whether it
+                * created an empty extent, so there could be junk in
+                * the other fields.
+                */
+               memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+       }
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the beginning of the record at index + 1.
+ */
+static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
+                               handle_t *handle,
+                               struct ocfs2_extent_rec *split_rec,
+                               struct ocfs2_extent_list *el, int index)
+{
+       int ret;
+       unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+       struct ocfs2_extent_rec *left_rec;
+       struct ocfs2_extent_rec *right_rec;
+
+       BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
+
+       left_rec = &el->l_recs[index];
+       right_rec = &el->l_recs[index + 1];
+
+       ret = ocfs2_journal_access(handle, inode, bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
+
+       le32_add_cpu(&right_rec->e_cpos, -split_clusters);
+       le64_add_cpu(&right_rec->e_blkno,
+                    -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+       le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
+
+       ocfs2_cleanup_merge(el, index);
+
+       ret = ocfs2_journal_dirty(handle, bh);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the tail of the record at index - 1.
+ *
+ * Returns 0 on success or a negative error code from the journalling
+ * layer.
+ */
+static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+                               handle_t *handle,
+                               struct ocfs2_extent_rec *split_rec,
+                               struct ocfs2_extent_list *el, int index)
+{
+       int ret, has_empty_extent = 0;
+       unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+       struct ocfs2_extent_rec *left_rec;
+       struct ocfs2_extent_rec *right_rec;
+
+       BUG_ON(index <= 0);
+
+       left_rec = &el->l_recs[index - 1];
+       right_rec = &el->l_recs[index];
+       if (ocfs2_is_empty_extent(&el->l_recs[0]))
+               has_empty_extent = 1;
+
+       ret = ocfs2_journal_access(handle, inode, bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (has_empty_extent && index == 1) {
+               /*
+                * The easy case - we can just plop the record right in.
+                */
+               *left_rec = *split_rec;
+
+               /*
+                * NOTE(review): this store is dead - has_empty_extent
+                * is never read again after this point.
+                */
+               has_empty_extent = 0;
+       } else {
+               /* Otherwise just extend the left neighbor's tail. */
+               le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
+       }
+
+       /*
+        * Shrink the right record from its front: cpos and blkno move
+        * forward by split_clusters, length shrinks accordingly.
+        */
+       le32_add_cpu(&right_rec->e_cpos, split_clusters);
+       le64_add_cpu(&right_rec->e_blkno,
+                    ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+       le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
+
+       ocfs2_cleanup_merge(el, index);
+
+       ret = ocfs2_journal_dirty(handle, bh);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       return ret;
+}
+
+/*
+ * Try to merge the record at split_index (fully described by
+ * split_rec) with its physically contiguous neighbor(s). ctxt says
+ * which merges are possible (left, right or both) and whether
+ * split_rec covers the entire existing record.
+ *
+ * Emptied records are rotated away via ocfs2_rotate_tree_left() so
+ * that at most one empty extent ever remains in the leaf.
+ *
+ * Fix vs. original: the local 'delete_tail_recs' counter was
+ * write-only (incremented twice, never read) - it has been removed
+ * and the two nested conditions collapsed into one. No behavior
+ * change.
+ */
+static int ocfs2_try_to_merge_extent(struct inode *inode,
+                                    handle_t *handle,
+                                    struct ocfs2_path *left_path,
+                                    int split_index,
+                                    struct ocfs2_extent_rec *split_rec,
+                                    struct ocfs2_cached_dealloc_ctxt *dealloc,
+                                    struct ocfs2_merge_ctxt *ctxt)
+{
+       int ret = 0;
+       struct ocfs2_extent_list *el = path_leaf_el(left_path);
+       struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+
+       BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
+
+       if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+               /*
+                * The merge code will need to create an empty
+                * extent to take the place of the newly
+                * emptied slot. Remove any pre-existing empty
+                * extents - having more than one in a leaf is
+                * illegal.
+                */
+               ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                                            dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               split_index--;
+               rec = &el->l_recs[split_index];
+       }
+
+       if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+               /*
+                * Left-right contig implies this.
+                */
+               BUG_ON(!ctxt->c_split_covers_rec);
+               BUG_ON(split_index == 0);
+
+               /*
+                * Since the leftright insert always covers the entire
+                * extent, this call will delete the insert record
+                * entirely, resulting in an empty extent record added to
+                * the extent block.
+                *
+                * Since the adding of an empty extent shifts
+                * everything back to the right, there's no need to
+                * update split_index here.
+                */
+               ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
+                                          handle, split_rec, el, split_index);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               /*
+                * We can only get this from logic error above.
+                */
+               BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+               /*
+                * The left merge left us with an empty extent, remove
+                * it.
+                */
+               ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               split_index--;
+               rec = &el->l_recs[split_index];
+
+               /*
+                * Note that we don't pass split_rec here on purpose -
+                * we've merged it into the left side.
+                */
+               ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
+                                           handle, rec, el, split_index);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+               ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                                            dealloc);
+               /*
+                * Error from this last rotate is not critical, so
+                * print but don't bubble it up.
+                */
+               if (ret)
+                       mlog_errno(ret);
+               ret = 0;
+       } else {
+               /*
+                * Merge a record to the left or right.
+                *
+                * 'contig_type' is relative to the existing record,
+                * so for example, if we're "right contig", it's to
+                * the record on the left (hence the left merge).
+                */
+               if (ctxt->c_contig_type == CONTIG_RIGHT) {
+                       ret = ocfs2_merge_rec_left(inode,
+                                                  path_leaf_bh(left_path),
+                                                  handle, split_rec, el,
+                                                  split_index);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               } else {
+                       ret = ocfs2_merge_rec_right(inode,
+                                                   path_leaf_bh(left_path),
+                                                   handle, split_rec, el,
+                                                   split_index);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               }
+
+               if (ctxt->c_split_covers_rec) {
+                       /*
+                        * The merge may have left an empty extent in
+                        * our leaf. Try to rotate it away.
+                        */
+                       ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                                                    dealloc);
+                       if (ret)
+                               mlog_errno(ret);
+                       ret = 0;
+               }
+       }
+
+out:
+       return ret;
+}
+
+/*
+ * Knock split_rec's clusters out of rec. Assumes the region described
+ * by split_rec lies on one edge of rec (split selects which edge);
+ * a middle split is not handled here.
+ */
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+                                   enum ocfs2_split_type split,
+                                   struct ocfs2_extent_rec *rec,
+                                   struct ocfs2_extent_rec *split_rec)
+{
+       u64 len_blocks;
+
+       len_blocks = ocfs2_clusters_to_blocks(sb,
+                               le16_to_cpu(split_rec->e_leaf_clusters));
+
+       if (split == SPLIT_LEFT) {
+               /*
+                * Region is on the left edge of the existing
+                * record - advance rec's start (cpos and blkno)
+                * past it and shrink the length.
+                */
+               le32_add_cpu(&rec->e_cpos,
+                            le16_to_cpu(split_rec->e_leaf_clusters));
+               le64_add_cpu(&rec->e_blkno, len_blocks);
+               le16_add_cpu(&rec->e_leaf_clusters,
+                            -le16_to_cpu(split_rec->e_leaf_clusters));
+       } else {
+               /*
+                * Region is on the right edge of the existing
+                * record - just shrink the length.
+                */
+               le16_add_cpu(&rec->e_leaf_clusters,
+                            -le16_to_cpu(split_rec->e_leaf_clusters));
+       }
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ *
+ * Four cases are handled in order: a split insert, a contiguous
+ * insert (extend a neighboring record in place), an insert into an
+ * empty leaf, and a tail append. Anything else falls through to a
+ * leaf rotation.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+                                struct ocfs2_extent_list *el,
+                                struct ocfs2_insert_type *insert,
+                                struct inode *inode)
+{
+       int i = insert->ins_contig_index;
+       unsigned int range;
+       struct ocfs2_extent_rec *rec;
+
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+       if (insert->ins_split != SPLIT_NONE) {
+               /* Split insert: trim the existing record, then rotate
+                * the remainder in below. */
+               i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+               BUG_ON(i == -1);
+               rec = &el->l_recs[i];
+               ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+                                       insert_rec);
+               goto rotate;
+       }
+
+       /*
+        * Contiguous insert - either left or right.
+        */
+       if (insert->ins_contig != CONTIG_NONE) {
+               rec = &el->l_recs[i];
+               if (insert->ins_contig == CONTIG_LEFT) {
+                       /* New extent precedes rec - adopt its start. */
+                       rec->e_blkno = insert_rec->e_blkno;
+                       rec->e_cpos = insert_rec->e_cpos;
+               }
+               le16_add_cpu(&rec->e_leaf_clusters,
+                            le16_to_cpu(insert_rec->e_leaf_clusters));
+               return;
+       }
+
+       /*
+        * Handle insert into an empty leaf.
+        */
+       if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+           ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+            ocfs2_is_empty_extent(&el->l_recs[0]))) {
+               el->l_recs[0] = *insert_rec;
+               el->l_next_free_rec = cpu_to_le16(1);
+               return;
+       }
+
+       /*
+        * Appending insert.
+        */
+       if (insert->ins_appending == APPEND_TAIL) {
+               i = le16_to_cpu(el->l_next_free_rec) - 1;
+               rec = &el->l_recs[i];
+               range = le32_to_cpu(rec->e_cpos)
+                       + le16_to_cpu(rec->e_leaf_clusters);
+               BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+               mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+                               le16_to_cpu(el->l_count),
+                               "inode %lu, depth %u, count %u, next free %u, "
+                               "rec.cpos %u, rec.clusters %u, "
+                               "insert.cpos %u, insert.clusters %u\n",
+                               inode->i_ino,
+                               le16_to_cpu(el->l_tree_depth),
+                               le16_to_cpu(el->l_count),
+                               le16_to_cpu(el->l_next_free_rec),
+                               le32_to_cpu(el->l_recs[i].e_cpos),
+                               le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+                               le32_to_cpu(insert_rec->e_cpos),
+                               le16_to_cpu(insert_rec->e_leaf_clusters));
+               i++;
+               el->l_recs[i] = *insert_rec;
+               le16_add_cpu(&el->l_next_free_rec, 1);
+               return;
+       }
+
+rotate:
+       /*
+        * Ok, we have to rotate.
+        *
+        * At this point, it is safe to assume that inserting into an
+        * empty leaf and appending to a leaf have both been handled
+        * above.
+        *
+        * This leaf needs to have space, either by the empty 1st
+        * extent record, or by virtue of an l_next_rec < l_count.
+        */
+       ocfs2_rotate_leaf(el, insert_rec);
+}
+
+/*
+ * Add 'clusters' to the dinode's on-disk cluster count and mirror
+ * the new total into the in-memory inode, under ip_lock.
+ */
+static inline void ocfs2_update_dinode_clusters(struct inode *inode,
+                                               struct ocfs2_dinode *di,
+                                               u32 clusters)
+{
+       le32_add_cpu(&di->i_clusters, clusters);
+       spin_lock(&OCFS2_I(inode)->ip_lock);
+       OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+       spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+/*
+ * Walk the interior (non-leaf) nodes of path and stretch the
+ * rightmost record of each to cover insert_rec, which is being
+ * appended at the far right edge of the tree.
+ *
+ * NOTE(review): this function returns void, so the -EIO assigned on
+ * a corrupt extent list and any ocfs2_journal_dirty() failure are
+ * logged but cannot reach the caller - TODO consider returning int.
+ */
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+                                          handle_t *handle,
+                                          struct ocfs2_path *path,
+                                          struct ocfs2_extent_rec *insert_rec)
+{
+       int ret, i, next_free;
+       struct buffer_head *bh;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_rec *rec;
+
+       /*
+        * Update everything except the leaf block.
+        */
+       for (i = 0; i < path->p_tree_depth; i++) {
+               bh = path->p_node[i].bh;
+               el = path->p_node[i].el;
+
+               next_free = le16_to_cpu(el->l_next_free_rec);
+               if (next_free == 0) {
+                       ocfs2_error(inode->i_sb,
+                                   "Dinode %llu has a bad extent list",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                       ret = -EIO;
+                       return;
+               }
+
+               rec = &el->l_recs[next_free - 1];
+
+               /* e_int_clusters = (end cpos of insert_rec) - rec->e_cpos */
+               rec->e_int_clusters = insert_rec->e_cpos;
+               le32_add_cpu(&rec->e_int_clusters,
+                            le16_to_cpu(insert_rec->e_leaf_clusters));
+               le32_add_cpu(&rec->e_int_clusters,
+                            -le32_to_cpu(rec->e_cpos));
+
+               ret = ocfs2_journal_dirty(handle, bh);
+               if (ret)
+                       mlog_errno(ret);
+
+       }
+}
+
+/*
+ * Prepare the tree for an appending insert of insert_rec into
+ * right_path: journal every block on right_path and stretch its
+ * rightmost interior records to cover the new extent.
+ *
+ * If the append lands in the leftmost slot of the leaf, the
+ * neighboring path to the left may also need updating; it is then
+ * looked up and handed back via *ret_left_path (freed by the
+ * caller). Otherwise *ret_left_path is NULL. Returns 0 on success.
+ */
+static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+                                   struct ocfs2_extent_rec *insert_rec,
+                                   struct ocfs2_path *right_path,
+                                   struct ocfs2_path **ret_left_path)
+{
+       int ret, next_free;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_path *left_path = NULL;
+
+       *ret_left_path = NULL;
+
+       /*
+        * This shouldn't happen for non-trees. The extent rec cluster
+        * count manipulation below only works for interior nodes.
+        */
+       BUG_ON(right_path->p_tree_depth == 0);
+
+       /*
+        * If our appending insert is at the leftmost edge of a leaf,
+        * then we might need to update the rightmost records of the
+        * neighboring path.
+        */
+       el = path_leaf_el(right_path);
+       next_free = le16_to_cpu(el->l_next_free_rec);
+       if (next_free == 0 ||
+           (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+               u32 left_cpos;
+
+               ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+                                                   &left_cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               mlog(0, "Append may need a left path update. cpos: %u, "
+                    "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+                    left_cpos);
+
+               /*
+                * No need to worry if the append is already in the
+                * leftmost leaf.
+                */
+               if (left_cpos) {
+                       left_path = ocfs2_new_path(path_root_bh(right_path),
+                                                  path_root_el(right_path));
+                       if (!left_path) {
+                               ret = -ENOMEM;
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       ret = ocfs2_find_path(inode, left_path, left_cpos);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       /*
+                        * ocfs2_insert_path() will pass the left_path to the
+                        * journal for us.
+                        */
+               }
+       }
+
+       ret = ocfs2_journal_access_path(inode, handle, right_path);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
+
+       *ret_left_path = left_path;
+       ret = 0;
+out:
+       if (ret != 0)
+               ocfs2_free_path(left_path);
+
+       return ret;
+}
+
+/*
+ * Do the actual record insert for a split. The target leaf is
+ * normally the one in right_path; when the record to split has been
+ * rotated over to the left leaf (or the split itself belongs there),
+ * left_path is used instead. The caller is responsible for
+ * journalling both paths.
+ *
+ * Fix vs. original: removed a stray double semicolon after the
+ * path_leaf_el(right_path) assignment (harmless empty statement,
+ * flagged by checkpatch). No behavior change.
+ */
+static void ocfs2_split_record(struct inode *inode,
+                              struct ocfs2_path *left_path,
+                              struct ocfs2_path *right_path,
+                              struct ocfs2_extent_rec *split_rec,
+                              enum ocfs2_split_type split)
+{
+       int index;
+       u32 cpos = le32_to_cpu(split_rec->e_cpos);
+       struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+       struct ocfs2_extent_rec *rec, *tmprec;
+
+       right_el = path_leaf_el(right_path);
+       if (left_path)
+               left_el = path_leaf_el(left_path);
+
+       el = right_el;
+       insert_el = right_el;
+       index = ocfs2_search_extent_list(el, cpos);
+       if (index != -1) {
+               if (index == 0 && left_path) {
+                       BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+                       /*
+                        * This typically means that the record
+                        * started in the left path but moved to the
+                        * right as a result of rotation. We either
+                        * move the existing record to the left, or we
+                        * do the later insert there.
+                        *
+                        * In this case, the left path should always
+                        * exist as the rotate code will have passed
+                        * it back for a post-insert update.
+                        */
+
+                       if (split == SPLIT_LEFT) {
+                               /*
+                                * It's a left split. Since we know
+                                * that the rotate code gave us an
+                                * empty extent in the left path, we
+                                * can just do the insert there.
+                                */
+                               insert_el = left_el;
+                       } else {
+                               /*
+                                * Right split - we have to move the
+                                * existing record over to the left
+                                * leaf. The insert will be into the
+                                * newly created empty extent in the
+                                * right leaf.
+                                */
+                               tmprec = &right_el->l_recs[index];
+                               ocfs2_rotate_leaf(left_el, tmprec);
+                               el = left_el;
+
+                               memset(tmprec, 0, sizeof(*tmprec));
+                               index = ocfs2_search_extent_list(left_el, cpos);
+                               BUG_ON(index == -1);
+                       }
+               }
+       } else {
+               BUG_ON(!left_path);
+               BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+               /*
+                * Left path is easy - we can just allow the insert to
+                * happen.
+                */
+               el = left_el;
+               insert_el = left_el;
+               index = ocfs2_search_extent_list(el, cpos);
+               BUG_ON(index == -1);
+       }
+
+       rec = &el->l_recs[index];
+       ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+       ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For dinode
+ * lists, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ *
+ * Returns 0 on success or a negative error code from the journal
+ * layer.
+ */
+static int ocfs2_insert_path(struct inode *inode,
+                            handle_t *handle,
+                            struct ocfs2_path *left_path,
+                            struct ocfs2_path *right_path,
+                            struct ocfs2_extent_rec *insert_rec,
+                            struct ocfs2_insert_type *insert)
+{
+       int ret, subtree_index;
+       struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+
+       /*
+        * Pass both paths to the journal. The majority of inserts
+        * will be touching all components anyway.
+        */
+       ret = ocfs2_journal_access_path(inode, handle, right_path);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (left_path) {
+               int credits = handle->h_buffer_credits;
+
+               /*
+                * There's a chance that left_path got passed back to
+                * us without being accounted for in the
+                * journal. Extend our transaction here to be sure we
+                * can change those blocks.
+                */
+               credits += left_path->p_tree_depth;
+
+               ret = ocfs2_extend_trans(handle, credits);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               ret = ocfs2_journal_access_path(inode, handle, left_path);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       if (insert->ins_split != SPLIT_NONE) {
+               /*
+                * We could call ocfs2_insert_at_leaf() for some types
+                * of splits, but it's easier to just let one separate
+                * function sort it all out.
+                */
+               ocfs2_split_record(inode, left_path, right_path,
+                                  insert_rec, insert->ins_split);
+       } else
+               ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
+                                    insert, inode);
+
+       /*
+        * NOTE(review): a dirty failure here is only logged; ret is
+        * unconditionally reset to 0 below.
+        */
+       ret = ocfs2_journal_dirty(handle, leaf_bh);
+       if (ret)
+               mlog_errno(ret);
+
+       if (left_path) {
+               /*
+                * The rotate code has indicated that we need to fix
+                * up portions of the tree after the insert.
+                *
+                * XXX: Should we extend the transaction here?
+                */
+               subtree_index = ocfs2_find_subtree_root(inode, left_path,
+                                                       right_path);
+               ocfs2_complete_edge_insert(inode, handle, left_path,
+                                          right_path, subtree_index);
+       }
+
+       ret = 0;
+out:
+       return ret;
+}
+
+/*
+ * Top-level insert helper: journal the dinode, pick the target
+ * path(s), do any rotation or append preparation above the leaf,
+ * perform the insert, and (for non-split inserts, which add new
+ * clusters) bump the dinode cluster count.
+ */
+static int ocfs2_do_insert_extent(struct inode *inode,
+                                 handle_t *handle,
+                                 struct buffer_head *di_bh,
+                                 struct ocfs2_extent_rec *insert_rec,
+                                 struct ocfs2_insert_type *type)
+{
+       int ret, rotate = 0;
+       u32 cpos;
+       struct ocfs2_path *right_path = NULL;
+       struct ocfs2_path *left_path = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_extent_list *el;
+
+       di = (struct ocfs2_dinode *) di_bh->b_data;
+       el = &di->id2.i_list;
+
+       ret = ocfs2_journal_access(handle, inode, di_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (le16_to_cpu(el->l_tree_depth) == 0) {
+               /* No tree - insert straight into the dinode's list. */
+               ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+               goto out_update_clusters;
+       }
+
+       right_path = ocfs2_new_inode_path(di_bh);
+       if (!right_path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * Determine the path to start with. Rotations need the
+        * rightmost path, everything else can go directly to the
+        * target leaf.
+        */
+       cpos = le32_to_cpu(insert_rec->e_cpos);
+       if (type->ins_appending == APPEND_NONE &&
+           type->ins_contig == CONTIG_NONE) {
+               rotate = 1;
+               cpos = UINT_MAX;
+       }
+
+       ret = ocfs2_find_path(inode, right_path, cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * Rotations and appends need special treatment - they modify
+        * parts of the tree's above them.
+        *
+        * Both might pass back a path immediate to the left of the
+        * one being inserted to. This will cause
+        * ocfs2_insert_path() to modify the rightmost records of
+        * left_path to account for an edge insert.
+        *
+        * XXX: When modifying this code, keep in mind that an insert
+        * can wind up skipping both of these two special cases...
+        */
+       if (rotate) {
+               ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
+                                             le32_to_cpu(insert_rec->e_cpos),
+                                             right_path, &left_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       } else if (type->ins_appending == APPEND_TAIL
+                  && type->ins_contig != CONTIG_LEFT) {
+               ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+                                              right_path, &left_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+                               insert_rec, type);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+out_update_clusters:
+       /* A split rearranges existing clusters - no count change. */
+       if (type->ins_split == SPLIT_NONE)
+               ocfs2_update_dinode_clusters(inode, di,
+                                            le16_to_cpu(insert_rec->e_leaf_clusters));
+
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       ocfs2_free_path(left_path);
+       ocfs2_free_path(right_path);
+
+       return ret;
+}
+
+/*
+ * Determine how split_rec, destined for the record at 'index',
+ * relates to its neighbors: contiguous with the record on one side,
+ * both sides (CONTIG_LEFTRIGHT), or neither (CONTIG_NONE).
+ */
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct inode *inode,
+                              struct ocfs2_extent_list *el, int index,
+                              struct ocfs2_extent_rec *split_rec)
+{
+       struct ocfs2_extent_rec *rec;
+       enum ocfs2_contig_type ret = CONTIG_NONE;
+
+       /*
+        * We're careful to check for an empty extent record here -
+        * the merge code will know what to do if it sees one.
+        */
+
+       if (index > 0) {
+               rec = &el->l_recs[index - 1];
+               if (index == 1 && ocfs2_is_empty_extent(rec)) {
+                       if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+                               ret = CONTIG_RIGHT;
+               } else {
+                       ret = ocfs2_extent_contig(inode, rec, split_rec);
+               }
+       }
+
+       if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+               enum ocfs2_contig_type contig_type;
+
+               rec = &el->l_recs[index + 1];
+               contig_type = ocfs2_extent_contig(inode, rec, split_rec);
+
+               /* Contiguous on both sides - the full merge case. */
+               if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+                       ret = CONTIG_LEFTRIGHT;
+               else if (ret == CONTIG_NONE)
+                       ret = contig_type;
+       }
+
+       return ret;
+}
+
+/*
+ * Scan a depth-zero list for a record contiguous with insert_rec.
+ * On a match, store the record's index in insert->ins_contig_index;
+ * the resulting contiguity type always goes in insert->ins_contig.
+ */
+static void ocfs2_figure_contig_type(struct inode *inode,
+                                    struct ocfs2_insert_type *insert,
+                                    struct ocfs2_extent_list *el,
+                                    struct ocfs2_extent_rec *insert_rec)
+{
+       int i;
+       enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+       for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+               contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
+                                                 insert_rec);
+               if (contig_type != CONTIG_NONE) {
+                       insert->ins_contig_index = i;
+                       break;
+               }
+       }
+       insert->ins_contig = contig_type;
+}
+
+/*
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the dinode list for tree's with 0
+ * depth. If we consider the dinode list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+                                       struct ocfs2_extent_list *el,
+                                       struct ocfs2_extent_rec *insert_rec)
+{
+       int i;
+       u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+       struct ocfs2_extent_rec *rec;
+
+       insert->ins_appending = APPEND_NONE;
+
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+       if (!el->l_next_free_rec)
+               goto set_tail_append;
+
+       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+               /* Were all records empty? */
+               if (le16_to_cpu(el->l_next_free_rec) == 1)
+                       goto set_tail_append;
+       }
+
+       i = le16_to_cpu(el->l_next_free_rec) - 1;
+       rec = &el->l_recs[i];
+
+       /* New extent starts at or past the end of the last record. */
+       if (cpos >=
+           (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+               goto set_tail_append;
+
+       return;
+
+set_tail_append:
+       insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct inode *inode,
+                                   struct buffer_head *di_bh,
+                                   struct buffer_head **last_eb_bh,
+                                   struct ocfs2_extent_rec *insert_rec,
+                                   struct ocfs2_insert_type *insert)
+{
+       int ret;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_path *path = NULL;
+       struct buffer_head *bh = NULL;
+
+       insert->ins_split = SPLIT_NONE;
+
+       el = &di->id2.i_list;
+       insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+       if (el->l_tree_depth) {
+               /*
+                * If we have tree depth, we read in the
+                * rightmost extent block ahead of time as
+                * ocfs2_figure_insert_type() and ocfs2_add_branch()
+                * may want it later.
+                */
+               ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                      le64_to_cpu(di->i_last_eb_blk), &bh,
+                                      OCFS2_BH_CACHED, inode);
+               if (ret) {
+                       mlog_exit(ret);
+                       goto out;
+               }
+               eb = (struct ocfs2_extent_block *) bh->b_data;
+               el = &eb->h_list;
+       }
+
+       /*
+        * Unless we have a contiguous insert, we'll need to know if
+        * there is room left in our allocation tree for another
+        * extent record.
+        *
+        * XXX: This test is simplistic, we can search for empty
+        * extent records too.
+        */
+       insert->ins_free_records = le16_to_cpu(el->l_count) -
+               le16_to_cpu(el->l_next_free_rec);
+
+       if (!insert->ins_tree_depth) {
+               ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+               ocfs2_figure_appending_type(insert, el, insert_rec);
+               return 0;
+       }
+
+       path = ocfs2_new_inode_path(di_bh);
+       if (!path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * In the case that we're inserting past what the tree
+        * currently accounts for, ocfs2_find_path() will return for
+        * us the rightmost tree path. This is accounted for below in
+        * the appending code.
+        */
+       ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       el = path_leaf_el(path);
+
+       /*
+        * Now that we have the path, there's two things we want to determine:
+        * 1) Contiguousness (also set contig_index if this is so)
+        *
+        * 2) Are we doing an append? We can trivially break this up
+         *     into two types of appends: simple record append, or a
+         *     rotate inside the tail leaf.
+        */
+       ocfs2_figure_contig_type(inode, insert, el, insert_rec);
 
        /*
         * The insert code isn't quite ready to deal with all cases of
@@ -2295,143 +3694,819 @@ static int ocfs2_figure_insert_type(struct inode *inode,
         */
        if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
                /*
-                * Ok, ocfs2_find_path() returned us the rightmost
-                * tree path. This might be an appending insert. There are
-                * two cases:
-                *    1) We're doing a true append at the tail:
-                *      -This might even be off the end of the leaf
-                *    2) We're "appending" by rotating in the tail
+                * Ok, ocfs2_find_path() returned us the rightmost
+                * tree path. This might be an appending insert. There are
+                * two cases:
+                *    1) We're doing a true append at the tail:
+                *      -This might even be off the end of the leaf
+                *    2) We're "appending" by rotating in the tail
+                */
+               ocfs2_figure_appending_type(insert, el, insert_rec);
+       }
+
+out:
+       ocfs2_free_path(path);
+
+       if (ret == 0)
+               *last_eb_bh = bh;
+       else
+               brelse(bh);
+       return ret;
+}
+
+/*
+ * Insert an extent into an inode btree.
+ *
+ * The caller needs to update fe->i_clusters
+ */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+                       handle_t *handle,
+                       struct inode *inode,
+                       struct buffer_head *fe_bh,
+                       u32 cpos,
+                       u64 start_blk,
+                       u32 new_clusters,
+                       u8 flags,
+                       struct ocfs2_alloc_context *meta_ac)
+{
+       int status;
+       struct buffer_head *last_eb_bh = NULL;
+       struct buffer_head *bh = NULL;
+       struct ocfs2_insert_type insert = {0, };
+       struct ocfs2_extent_rec rec;
+
+       mlog(0, "add %u clusters at position %u to inode %llu\n",
+            new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+       mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+                       (OCFS2_I(inode)->ip_clusters != cpos),
+                       "Device %s, asking for sparse allocation: inode %llu, "
+                       "cpos %u, clusters %u\n",
+                       osb->dev_str,
+                       (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
+                       OCFS2_I(inode)->ip_clusters);
+
+       memset(&rec, 0, sizeof(rec));
+       rec.e_cpos = cpu_to_le32(cpos);
+       rec.e_blkno = cpu_to_le64(start_blk);
+       rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+       rec.e_flags = flags;
+
+       status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+                                         &insert);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+            "Insert.contig_index: %d, Insert.free_records: %d, "
+            "Insert.tree_depth: %d\n",
+            insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+            insert.ins_free_records, insert.ins_tree_depth);
+
+       if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
+               status = ocfs2_grow_tree(inode, handle, fe_bh,
+                                        &insert.ins_tree_depth, &last_eb_bh,
+                                        meta_ac);
+               if (status) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       /* Finally, we can add clusters. This might rotate the tree for us. */
+       status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+       if (status < 0)
+               mlog_errno(status);
+       else
+               ocfs2_extent_map_insert_rec(inode, &rec);
+
+bail:
+       if (bh)
+               brelse(bh);
+
+       if (last_eb_bh)
+               brelse(last_eb_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+                                      struct ocfs2_extent_rec *split_rec,
+                                      u32 cpos,
+                                      struct ocfs2_extent_rec *rec)
+{
+       u32 rec_cpos = le32_to_cpu(rec->e_cpos);
+       u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
+
+       memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
+
+       split_rec->e_cpos = cpu_to_le32(cpos);
+       split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
+
+       split_rec->e_blkno = rec->e_blkno;
+       le64_add_cpu(&split_rec->e_blkno,
+                    ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
+
+       split_rec->e_flags = rec->e_flags;
+}
+
/*
 * Split the record at 'split_index' in the leaf given by 'path' and
 * insert 'orig_split_rec' into the resulting hole.
 *
 * A split that shares neither edge with the existing record is faked
 * as a right split followed by a second pass run as a left split; the
 * 'leftright' loop below therefore executes at most twice (enforced by
 * the BUG_ON(do_leftright)).
 *
 * *last_eb_bh must point at the rightmost leaf block for inodes with
 * tree depth > 0; ocfs2_grow_tree() may swap it for a new buffer.
 */
static int ocfs2_split_and_insert(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_path *path,
				  struct buffer_head *di_bh,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;
	struct ocfs2_dinode *di;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	/*
	 * The insert code wants the rightmost extent list so it can
	 * tell how much room the tree has left; for a non-zero depth
	 * that list lives in the last extent block, not the inode.
	 */
	di = (struct ocfs2_dinode *)di_bh->b_data;
	rightmost_el = &di->id2.i_list;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* Rightmost list is full - grow the tree before inserting. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		int old_depth = depth;

		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* A depth change means a new rightmost extent block. */
		if (old_depth != depth) {
			eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
			rightmost_el = &eb->h_list;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
		- le16_to_cpu(rightmost_el->l_next_free_rec);
	insert.ins_tree_depth = depth;

	/* Classify the split by which edges it shares with 'rec'. */
	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
					   &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
				     &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		/*
		 * Second pass: re-insert the caller's original record.
		 * The first insert may have shifted the tree, so the
		 * leaf and index have to be found again from scratch.
		 */
		do_leftright++;
		split_rec = *orig_split_rec;

		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
+
+/*
+ * Mark part or all of the extent record at split_index in the leaf
+ * pointed to by path as written. This removes the unwritten
+ * extent flag.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * last_eb_bh should be the rightmost leaf block for any inode with a
+ * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ */
+static int __ocfs2_mark_extent_written(struct inode *inode,
+                                      struct buffer_head *di_bh,
+                                      handle_t *handle,
+                                      struct ocfs2_path *path,
+                                      int split_index,
+                                      struct ocfs2_extent_rec *split_rec,
+                                      struct ocfs2_alloc_context *meta_ac,
+                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int ret = 0;
+       struct ocfs2_extent_list *el = path_leaf_el(path);
+       struct buffer_head *eb_bh, *last_eb_bh = NULL;
+       struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+       struct ocfs2_merge_ctxt ctxt;
+       struct ocfs2_extent_list *rightmost_el;
+
+       if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+               ret = -EIO;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+           ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+            (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+               ret = -EIO;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       eb_bh = path_leaf_bh(path);
+       ret = ocfs2_journal_access(handle, inode, eb_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+                                                           split_index,
+                                                           split_rec);
+
+       /*
+        * The core merge / split code wants to know how much room is
+        * left in this inodes allocation tree, so we pass the
+        * rightmost extent list.
+        */
+       if (path->p_tree_depth) {
+               struct ocfs2_extent_block *eb;
+               struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+               ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                      le64_to_cpu(di->i_last_eb_blk),
+                                      &last_eb_bh, OCFS2_BH_CACHED, inode);
+               if (ret) {
+                       mlog_exit(ret);
+                       goto out;
+               }
+
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       ret = -EROFS;
+                       goto out;
+               }
+
+               rightmost_el = &eb->h_list;
+       } else
+               rightmost_el = path_root_el(path);
+
+       ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
+       if (ctxt.c_used_tail_recs > 0 &&
+           ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
+               ctxt.c_used_tail_recs--;
+
+       if (rec->e_cpos == split_rec->e_cpos &&
+           rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+               ctxt.c_split_covers_rec = 1;
        else
-               brelse(bh);
+               ctxt.c_split_covers_rec = 0;
+
+       ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+       mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
+            "has_empty: %u, split_covers: %u\n", split_index,
+            ctxt.c_contig_type, ctxt.c_used_tail_recs,
+            ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
+
+       if (ctxt.c_contig_type == CONTIG_NONE) {
+               if (ctxt.c_split_covers_rec)
+                       el->l_recs[split_index] = *split_rec;
+               else
+                       ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+                                                    &last_eb_bh, split_index,
+                                                    split_rec, meta_ac);
+               if (ret)
+                       mlog_errno(ret);
+       } else {
+               ret = ocfs2_try_to_merge_extent(inode, handle, path,
+                                               split_index, split_rec,
+                                               dealloc, &ctxt);
+               if (ret)
+                       mlog_errno(ret);
+       }
+
+       ocfs2_journal_dirty(handle, eb_bh);
+
+out:
+       brelse(last_eb_bh);
        return ret;
 }
 
 /*
- * Insert an extent into an inode btree.
+ * Mark the already-existing extent at cpos as written for len clusters.
  *
- * The caller needs to update fe->i_clusters
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_insert_extent(struct ocfs2_super *osb,
-                       handle_t *handle,
-                       struct inode *inode,
-                       struct buffer_head *fe_bh,
-                       u32 cpos,
-                       u64 start_blk,
-                       u32 new_clusters,
-                       struct ocfs2_alloc_context *meta_ac)
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+                             handle_t *handle, u32 cpos, u32 len, u32 phys,
+                             struct ocfs2_alloc_context *meta_ac,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int ret, index;
+       u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+       struct ocfs2_extent_rec split_rec;
+       struct ocfs2_path *left_path = NULL;
+       struct ocfs2_extent_list *el;
+
+       mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
+            inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
+
+       if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+               ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+                           "that are being written to, but the feature bit "
+                           "is not set in the super block.",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno);
+               ret = -EROFS;
+               goto out;
+       }
+
+       /*
+        * XXX: This should be fixed up so that we just re-insert the
+        * next extent records.
+        */
+       ocfs2_extent_map_trunc(inode, 0);
+
+       left_path = ocfs2_new_inode_path(di_bh);
+       if (!left_path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_find_path(inode, left_path, cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       el = path_leaf_el(left_path);
+
+       index = ocfs2_search_extent_list(el, cpos);
+       if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has an extent at cpos %u which can no "
+                           "longer be found.\n",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+               ret = -EROFS;
+               goto out;
+       }
+
+       memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+       split_rec.e_cpos = cpu_to_le32(cpos);
+       split_rec.e_leaf_clusters = cpu_to_le16(len);
+       split_rec.e_blkno = cpu_to_le64(start_blkno);
+       split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
+       split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
+
+       ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
+                                         index, &split_rec, meta_ac, dealloc);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       ocfs2_free_path(left_path);
+       return ret;
+}
+
/*
 * Right-split the extent record at 'index' so that it ends at cluster
 * 'new_range', inserting the remainder as a new record. Extends the
 * running transaction and grows the tree if the rightmost extent list
 * is already full.
 */
static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	/*
	 * For a non-zero depth the rightmost extent list lives in the
	 * last extent block, which must be read in; otherwise the leaf
	 * list in the inode is the rightmost one.
	 */
	depth = path->p_tree_depth;
	if (depth > 0) {
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       le64_to_cpu(di->i_last_eb_blk),
				       &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Reserve enough journal credits for a possible tree grow. */
	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		int old_depth = depth;

		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* A depth change means a new rightmost extent block. */
		if (old_depth != depth) {
			eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
			rightmost_el = &eb->h_list;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
		- le16_to_cpu(rightmost_el->l_next_free_rec);
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}
+
/*
 * Remove clusters [cpos, cpos + len) from the extent record at 'index'
 * in the leaf given by 'path'. The range must cover the whole record
 * or share one of its edges - a middle-of-record removal is a caller
 * bug here (callers split the record first, see ocfs2_split_tree()).
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/* Rotate away an empty record in slot 0 first; our target
	 * record shifts down one slot as a result. */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our inode tree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* left_path may still be NULL here - presumably
	 * ocfs2_journal_access_path() tolerates a NULL path; verify. */
	ret = ocfs2_journal_access_path(inode, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		int next_free;

		/* Whole-record removal: empty it and rotate the hole
		 * out of the leaf below. */
		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		/* A leftmost-edge change must be reflected in the
		 * subtree joining this leaf and the one to its left. */
		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
 
-       mlog(0, "add %u clusters at position %u to inode %llu\n",
-            new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+                       u32 cpos, u32 len, handle_t *handle,
+                       struct ocfs2_alloc_context *meta_ac,
+                       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+       int ret, index;
+       u32 rec_range, trunc_range;
+       struct ocfs2_extent_rec *rec;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_path *path;
 
-       mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-                       (OCFS2_I(inode)->ip_clusters != cpos),
-                       "Device %s, asking for sparse allocation: inode %llu, "
-                       "cpos %u, clusters %u\n",
-                       osb->dev_str,
-                       (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
-                       OCFS2_I(inode)->ip_clusters);
+       ocfs2_extent_map_trunc(inode, 0);
 
-       memset(&rec, 0, sizeof(rec));
-       rec.e_cpos = cpu_to_le32(cpos);
-       rec.e_blkno = cpu_to_le64(start_blk);
-       rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+       path = ocfs2_new_inode_path(di_bh);
+       if (!path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
 
-       status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
-                                         &insert);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       ret = ocfs2_find_path(inode, path, cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
        }
 
-       mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
-            "Insert.contig_index: %d, Insert.free_records: %d, "
-            "Insert.tree_depth: %d\n",
-            insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
-            insert.ins_free_records, insert.ins_tree_depth);
+       el = path_leaf_el(path);
+       index = ocfs2_search_extent_list(el, cpos);
+       if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has an extent at cpos %u which can no "
+                           "longer be found.\n",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+               ret = -EROFS;
+               goto out;
+       }
 
        /*
-        * Avoid growing the tree unless we're out of records and the
-        * insert type requres one.
+        * We have 3 cases of extent removal:
+        *   1) Range covers the entire extent rec
+        *   2) Range begins or ends on one edge of the extent rec
+        *   3) Range is in the middle of the extent rec (no shared edges)
+        *
+        * For case 1 we remove the extent rec and left rotate to
+        * fill the hole.
+        *
+        * For case 2 we just shrink the existing extent rec, with a
+        * tree update if the shrinking edge is also the edge of an
+        * extent block.
+        *
+        * For case 3 we do a right split to turn the extent rec into
+        * something case 2 can handle.
         */
-       if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
-               goto out_add;
+       rec = &el->l_recs[index];
+       rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+       trunc_range = cpos + len;
 
-       shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
-       if (shift < 0) {
-               status = shift;
-               mlog_errno(status);
-               goto bail;
-       }
+       BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
 
-       /* We traveled all the way to the bottom of the allocation tree
-        * and didn't find room for any more extents - we need to add
-        * another tree level */
-       if (shift) {
-               BUG_ON(bh);
-               mlog(0, "need to shift tree depth "
-                    "(current = %d)\n", insert.ins_tree_depth);
+       mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
+            "(cpos %u, len %u)\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
+            le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
 
-               /* ocfs2_shift_tree_depth will return us a buffer with
-                * the new extent block (so we can pass that to
-                * ocfs2_add_branch). */
-               status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
-                                               meta_ac, &bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
+       if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+               ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+                                        cpos, len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       } else {
+               ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+                                      trunc_range, meta_ac);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
                }
-               insert.ins_tree_depth++;
-               /* Special case: we have room now if we shifted from
-                * tree_depth 0 */
-               if (insert.ins_tree_depth == 1)
-                       goto out_add;
-       }
 
-       /* call ocfs2_add_branch to add the final part of the tree with
-        * the new data. */
-       mlog(0, "add branch. bh = %p\n", bh);
-       status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
-                                 meta_ac);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+               /*
+                * The split could have manipulated the tree enough to
+                * move the record location, so we have to look for it again.
+                */
+               ocfs2_reinit_path(path, 1);
 
-out_add:
-       /* Finally, we can add clusters. This might rotate the tree for us. */
-       status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
-       if (status < 0)
-               mlog_errno(status);
-       else
-               ocfs2_extent_map_insert_rec(inode, &rec);
+               ret = ocfs2_find_path(inode, path, cpos);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
 
-bail:
-       if (bh)
-               brelse(bh);
+               el = path_leaf_el(path);
+               index = ocfs2_search_extent_list(el, cpos);
+               if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %llu: split at cpos %u lost record.",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                   cpos);
+                       ret = -EROFS;
+                       goto out;
+               }
 
-       if (last_eb_bh)
-               brelse(last_eb_bh);
+               /*
+                * Double check our values here. If anything is fishy,
+                * it's easier to catch it at the top level.
+                */
+               rec = &el->l_recs[index];
+               rec_range = le32_to_cpu(rec->e_cpos) +
+                       ocfs2_rec_clusters(el, rec);
+               if (rec_range != trunc_range) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %llu: error after split at cpos %u "
+                                   "trunc len %u, existing record is (%u,%u)",
+                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                   cpos, len, le32_to_cpu(rec->e_cpos),
+                                   ocfs2_rec_clusters(el, rec));
+                       ret = -EROFS;
+                       goto out;
+               }
 
-       mlog_exit(status);
-       return status;
+               ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+                                        cpos, len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+out:
+       ocfs2_free_path(path);
+       return ret;
 }
 
-static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
        struct buffer_head *tl_bh = osb->osb_tl_bh;
        struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
        return current_tail == new_start;
 }
 
-static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
-                                    handle_t *handle,
-                                    u64 start_blk,
-                                    unsigned int num_clusters)
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+                             handle_t *handle,
+                             u64 start_blk,
+                             unsigned int num_clusters)
 {
        int status, index;
        unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
 }
 
 /* Expects you to already be holding tl_inode->i_mutex */
-static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
        int status;
        unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
        return status;
 }
 
+/*
+ * Delayed de-allocation of suballocator blocks.
+ *
+ * Some sets of block de-allocations might involve multiple suballocator inodes.
+ *
+ * The locking for this can get extremely complicated, especially when
+ * the suballocator inodes to delete from aren't known until deep
+ * within an unrelated codepath.
+ *
+ * ocfs2_extent_block structures are a good example of this - an inode
+ * btree could have been grown by any number of nodes each allocating
+ * out of their own suballoc inode.
+ *
+ * These structures allow the delay of block de-allocation until a
+ * later time, when locking of multiple cluster inodes won't cause
+ * deadlock.
+ */
+
+/*
+ * Describes a single block free from a suballocator
+ */
+struct ocfs2_cached_block_free {
+       struct ocfs2_cached_block_free          *free_next;
+       u64                                     free_blk;
+       unsigned int                            free_bit;
+};
+
+struct ocfs2_per_slot_free_list {
+       struct ocfs2_per_slot_free_list         *f_next_suballocator;
+       int                                     f_inode_type;
+       int                                     f_slot;
+       struct ocfs2_cached_block_free          *f_first;
+};
+
+static int ocfs2_free_cached_items(struct ocfs2_super *osb,
+                                  int sysfile_type,
+                                  int slot,
+                                  struct ocfs2_cached_block_free *head)
+{
+       int ret;
+       u64 bg_blkno;
+       handle_t *handle;
+       struct inode *inode;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_cached_block_free *tmp;
+
+       inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
+       if (!inode) {
+               ret = -EINVAL;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       mutex_lock(&inode->i_mutex);
+
+       ret = ocfs2_meta_lock(inode, &di_bh, 1);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_mutex;
+       }
+
+       handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+
+       while (head) {
+               bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+                                                     head->free_bit);
+               mlog(0, "Free bit: (bit %u, blkno %llu)\n",
+                    head->free_bit, (unsigned long long)head->free_blk);
+
+               ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
+                                              head->free_bit, bg_blkno, 1);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_journal;
+               }
+
+               ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_journal;
+               }
+
+               tmp = head;
+               head = head->free_next;
+               kfree(tmp);
+       }
+
+out_journal:
+       ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+       ocfs2_meta_unlock(inode, 1);
+       brelse(di_bh);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       iput(inode);
+out:
+       while(head) {
+               /* Premature exit may have left some dangling items. */
+               tmp = head;
+               head = head->free_next;
+               kfree(tmp);
+       }
+
+       return ret;
+}
+
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+                      struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+       int ret = 0, ret2;
+       struct ocfs2_per_slot_free_list *fl;
+
+       if (!ctxt)
+               return 0;
+
+       while (ctxt->c_first_suballocator) {
+               fl = ctxt->c_first_suballocator;
+
+               if (fl->f_first) {
+                       mlog(0, "Free items: (type %d, slot %d)\n",
+                            fl->f_inode_type, fl->f_slot);
+                       ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
+                                                      fl->f_slot, fl->f_first);
+                       if (ret2)
+                               mlog_errno(ret2);
+                       if (!ret)
+                               ret = ret2;
+               }
+
+               ctxt->c_first_suballocator = fl->f_next_suballocator;
+               kfree(fl);
+       }
+
+       return ret;
+}
+
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_per_slot_free_list(int type,
+                             int slot,
+                             struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+       struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+       while (fl) {
+               if (fl->f_inode_type == type && fl->f_slot == slot)
+                       return fl;
+
+               fl = fl->f_next_suballocator;
+       }
+
+       fl = kmalloc(sizeof(*fl), GFP_NOFS);
+       if (fl) {
+               fl->f_inode_type = type;
+               fl->f_slot = slot;
+               fl->f_first = NULL;
+               fl->f_next_suballocator = ctxt->c_first_suballocator;
+
+               ctxt->c_first_suballocator = fl;
+       }
+       return fl;
+}
+
+static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                    int type, int slot, u64 blkno,
+                                    unsigned int bit)
+{
+       int ret;
+       struct ocfs2_per_slot_free_list *fl;
+       struct ocfs2_cached_block_free *item;
+
+       fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
+       if (fl == NULL) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       item = kmalloc(sizeof(*item), GFP_NOFS);
+       if (item == NULL) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       mlog(0, "Insert: (type %d, slot %d, bit %u, blk %llu)\n",
+            type, slot, bit, (unsigned long long)blkno);
+
+       item->free_blk = blkno;
+       item->free_bit = bit;
+       item->free_next = fl->f_first;
+
+       fl->f_first = item;
+
+       ret = 0;
+out:
+       return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                        struct ocfs2_extent_block *eb)
+{
+       return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+                                        le16_to_cpu(eb->h_suballoc_slot),
+                                        le64_to_cpu(eb->h_blkno),
+                                        le16_to_cpu(eb->h_suballoc_bit));
+}
+
 /* This function will figure out whether the currently last extent
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
                        BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
                        BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
 
-                       if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
-                               /*
-                                * This code only understands how to
-                                * lock the suballocator in slot 0,
-                                * which is fine because allocation is
-                                * only ever done out of that
-                                * suballocator too. A future version
-                                * might change that however, so avoid
-                                * a free if we don't know how to
-                                * handle it. This way an fs incompat
-                                * bit will not be necessary.
-                                */
-                               ret = ocfs2_free_extent_block(handle,
-                                                             tc->tc_ext_alloc_inode,
-                                                             tc->tc_ext_alloc_bh,
-                                                             eb);
-
-                               /* An error here is not fatal. */
-                               if (ret < 0)
-                                       mlog_errno(ret);
-                       }
+                       ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
+                       /* An error here is not fatal. */
+                       if (ret < 0)
+                               mlog_errno(ret);
                } else {
                        deleted_eb = 0;
                }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
        return ocfs2_journal_dirty_data(handle, bh);
 }
 
-static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
-                                    struct page **pages, int numpages,
-                                    u64 phys, handle_t *handle)
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+                                    loff_t end, struct page **pages,
+                                    int numpages, u64 phys, handle_t *handle)
 {
        int i, ret, partial = 0;
        void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
        if (numpages == 0)
                goto out;
 
-       from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
-       if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
-               /*
-                * Since 'from' has been capped to a value below page
-                * size, this calculation won't be able to overflow
-                * 'to'
-                */
-               to = ocfs2_align_bytes_to_clusters(sb, from);
-
-               /*
-                * The truncate tail in this case should never contain
-                * more than one page at maximum. The loop below also
-                * assumes this.
-                */
-               BUG_ON(numpages != 1);
-       }
-
+       to = PAGE_CACHE_SIZE;
        for(i = 0; i < numpages; i++) {
                page = pages[i];
 
+               from = start & (PAGE_CACHE_SIZE - 1);
+               if ((end >> PAGE_CACHE_SHIFT) == page->index)
+                       to = end & (PAGE_CACHE_SIZE - 1);
+
                BUG_ON(from > PAGE_CACHE_SIZE);
                BUG_ON(to > PAGE_CACHE_SIZE);
 
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
 
                flush_dcache_page(page);
 
-               /*
-                * Every page after the 1st one should be completely zero'd.
-                */
-               from = 0;
+               start = (page->index + 1) << PAGE_CACHE_SHIFT;
        }
 out:
        if (pages) {
@@ -3484,24 +5740,26 @@ out:
        }
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
-                               int *num, u64 *phys)
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+                               struct page **pages, int *num, u64 *phys)
 {
        int i, numpages = 0, ret = 0;
-       unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
        unsigned int ext_flags;
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index;
-       u64 next_cluster_bytes;
+       loff_t last_page_bytes;
 
        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+       BUG_ON(start > end);
 
-       /* Cluster boundary, so we don't need to grab any pages. */
-       if ((isize & (csize - 1)) == 0)
+       if (start == end)
                goto out;
 
-       ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+       BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+              (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+       ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
                                          phys, NULL, &ext_flags);
        if (ret) {
                mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
        if (ext_flags & OCFS2_EXT_UNWRITTEN)
                goto out;
 
-       next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
-       index = isize >> PAGE_CACHE_SHIFT;
+       last_page_bytes = PAGE_ALIGN(end);
+       index = start >> PAGE_CACHE_SHIFT;
        do {
                pages[numpages] = grab_cache_page(mapping, index);
                if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 
                numpages++;
                index++;
-       } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+       } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
 
 out:
        if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
  * otherwise block_write_full_page() will skip writeout of pages past
  * i_size. The new_i_size parameter is passed for this reason.
  */
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
-                                u64 new_i_size)
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+                                 u64 range_start, u64 range_end)
 {
        int ret, numpages;
-       loff_t endbyte;
        struct page **pages = NULL;
        u64 phys;
 
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
                goto out;
        }
 
-       ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+       ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+                                  &numpages, &phys);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
        if (numpages == 0)
                goto out;
 
-       ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
-                                handle);
+       ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+                                numpages, phys, handle);
 
        /*
         * Initiate writeout of the pages we zero'd here. We don't
         * wait on them - the truncate_inode_pages() call later will
         * do that for us.
         */
-       endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
-       ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
-                                   endbyte - 1, SYNC_FILE_RANGE_WRITE);
+       ret = do_sync_mapping_range(inode->i_mapping, range_start,
+                                   range_end - 1, SYNC_FILE_RANGE_WRITE);
        if (ret)
                mlog_errno(ret);
 
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 
        mlog_entry_void();
 
-       down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
 
@@ -3754,7 +6009,6 @@ start:
        goto start;
 
 bail:
-       up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
        ocfs2_schedule_truncate_log_flush(osb, 1);
 
@@ -3764,6 +6018,8 @@ bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
 
+       ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+
        ocfs2_free_path(path);
 
        /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
 }
 
 /*
- * Expects the inode to already be locked. This will figure out which
- * inodes need to be locked and will put them on the returned truncate
- * context.
+ * Expects the inode to already be locked.
  */
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct buffer_head *fe_bh,
                           struct ocfs2_truncate_context **tc)
 {
-       int status, metadata_delete, i;
+       int status;
        unsigned int new_i_clusters;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
-       struct ocfs2_extent_list *el;
        struct buffer_head *last_eb_bh = NULL;
-       struct inode *ext_alloc_inode = NULL;
-       struct buffer_head *ext_alloc_bh = NULL;
 
        mlog_entry_void();
 
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                mlog_errno(status);
                goto bail;
        }
+       ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
-       metadata_delete = 0;
        if (fe->id2.i_list.l_tree_depth) {
-               /* If we have a tree, then the truncate may result in
-                * metadata deletes. Figure this out from the
-                * rightmost leaf block.*/
                status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
                                          &last_eb_bh, OCFS2_BH_CACHED, inode);
                if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                        status = -EIO;
                        goto bail;
                }
-               el = &(eb->h_list);
-
-               i = 0;
-               if (ocfs2_is_empty_extent(&el->l_recs[0]))
-                       i = 1;
-               /*
-                * XXX: Should we check that next_free_rec contains
-                * the extent?
-                */
-               if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
-                       metadata_delete = 1;
        }
 
        (*tc)->tc_last_eb_bh = last_eb_bh;
 
-       if (metadata_delete) {
-               mlog(0, "Will have to delete metadata for this trunc. "
-                    "locking allocator.\n");
-               ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
-               if (!ext_alloc_inode) {
-                       status = -ENOMEM;
-                       mlog_errno(status);
-                       goto bail;
-               }
-
-               mutex_lock(&ext_alloc_inode->i_mutex);
-               (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
-
-               status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-               (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
-               (*tc)->tc_ext_alloc_locked = 1;
-       }
-
        status = 0;
 bail:
        if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 {
-       if (tc->tc_ext_alloc_inode) {
-               if (tc->tc_ext_alloc_locked)
-                       ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
-
-               mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
-               iput(tc->tc_ext_alloc_inode);
-       }
-
-       if (tc->tc_ext_alloc_bh)
-               brelse(tc->tc_ext_alloc_bh);
+       /*
+        * The caller is responsible for completing deallocation
+        * before freeing the context.
+        */
+       if (tc->tc_dealloc.c_first_suballocator != NULL)
+               mlog(ML_NOTICE,
+                    "Truncate completion has non-empty dealloc context\n");
 
        if (tc->tc_last_eb_bh)
                brelse(tc->tc_last_eb_bh);
index fbcb5934a0817e72ecc7bca583a3ae00b1ef1dae..990df48ae8d361459ee27b42cccfb7b0f935c222 100644 (file)
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
                        u32 cpos,
                        u64 start_blk,
                        u32 new_clusters,
+                       u8 flags,
                        struct ocfs2_alloc_context *meta_ac);
+struct ocfs2_cached_dealloc_ctxt;
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+                             handle_t *handle, u32 cpos, u32 len, u32 phys,
+                             struct ocfs2_alloc_context *meta_ac,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+                       u32 cpos, u32 len, handle_t *handle,
+                       struct ocfs2_alloc_context *meta_ac,
+                       struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
                                      struct ocfs2_dinode **tl_copy);
 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
                                         struct ocfs2_dinode *tl_copy);
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+                             handle_t *handle,
+                             u64 start_blk,
+                             unsigned int num_clusters);
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+
+/*
+ * Process local structure which describes the block unlinks done
+ * during an operation. This is populated via
+ * ocfs2_cache_block_dealloc().
+ *
+ * ocfs2_run_deallocs() should be called after the potentially
+ * de-allocating routines. No journal handles should be open, and most
+ * locks should have been dropped.
+ */
+struct ocfs2_cached_dealloc_ctxt {
+       struct ocfs2_per_slot_free_list         *c_first_suballocator;
+};
+static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
+{
+       c->c_first_suballocator = NULL;
+}
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+                      struct ocfs2_cached_dealloc_ctxt *ctxt);
 
 struct ocfs2_truncate_context {
-       struct inode *tc_ext_alloc_inode;
-       struct buffer_head *tc_ext_alloc_bh;
+       struct ocfs2_cached_dealloc_ctxt tc_dealloc;
        int tc_ext_alloc_locked; /* is it cluster locked? */
        /* these get destroyed once it's passed to ocfs2_commit_truncate. */
        struct buffer_head *tc_last_eb_bh;
 };
 
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
-                                u64 new_i_size);
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+                                 u64 range_start, u64 range_end);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 
 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
                    u32 cpos, struct buffer_head **leaf_bh);
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
 /*
  * Helper function to look at the # of clusters in an extent record.
index a480b09c79b916de88252129919bedbbfb10850a..84bf6e79de235b88bb7246042f39e73805ae6cb1 100644 (file)
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
             bh = bh->b_this_page, block_start += bsize) {
                block_end = block_start + bsize;
 
+               clear_buffer_new(bh);
+
                /*
                 * Ignore blocks outside of our i/o range -
                 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                 * For an allocating write with cluster size >= page
                 * size, we always write the entire page.
                 */
-
-               if (buffer_new(bh))
-                       clear_buffer_new(bh);
+               if (new)
+                       set_buffer_new(bh);
 
                if (!buffer_mapped(bh)) {
                        map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
-                    (block_start < from || block_end > to)) {
+                          !buffer_new(bh) &&
+                          (block_start < from || block_end > to)) {
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
                }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
        bh = head;
        block_start = 0;
        do {
-               void *kaddr;
-
                block_end = block_start + bsize;
                if (block_end <= from)
                        goto next_bh;
                if (block_start >= to)
                        break;
 
-               kaddr = kmap_atomic(page, KM_USER0);
-               memset(kaddr+block_start, 0, bh->b_size);
-               flush_dcache_page(page);
-               kunmap_atomic(kaddr, KM_USER0);
+               zero_user_page(page, block_start, bh->b_size, KM_USER0);
                set_buffer_uptodate(bh);
                mark_buffer_dirty(bh);
 
@@ -761,217 +758,240 @@ next_bh:
        return ret;
 }
 
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES   1
+#else
+#define OCFS2_MAX_CTXT_PAGES   (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE    (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
 /*
- * This will copy user data from the buffer page in the splice
- * context.
- *
- * For now, we ignore SPLICE_F_MOVE as that would require some extra
- * communication out all the way to ocfs2_write().
+ * Describe the state of a single cluster to be written to.
  */
-int ocfs2_map_and_write_splice_data(struct inode *inode,
-                                 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-                                 unsigned int *ret_from, unsigned int *ret_to)
+struct ocfs2_write_cluster_desc {
+       u32             c_cpos;
+       u32             c_phys;
+       /*
+        * Give this a unique field because c_phys eventually gets
+        * filled.
+        */
+       unsigned        c_new;
+       unsigned        c_unwritten;
+};
+
+static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
 {
-       int ret;
-       unsigned int to, from, cluster_start, cluster_end;
-       char *src, *dst;
-       struct ocfs2_splice_write_priv *sp = wc->w_private;
-       struct pipe_buffer *buf = sp->s_buf;
-       unsigned long bytes, src_from;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       return d->c_new || d->c_unwritten;
+}
 
-       ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-                                       &cluster_end);
+struct ocfs2_write_ctxt {
+       /* Logical cluster position / len of write */
+       u32                             w_cpos;
+       u32                             w_clen;
 
-       from = sp->s_offset;
-       src_from = sp->s_buf_offset;
-       bytes = wc->w_count;
+       struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
-       if (wc->w_large_pages) {
-               /*
-                * For cluster size < page size, we have to
-                * calculate pos within the cluster and obey
-                * the rightmost boundary.
-                */
-               bytes = min(bytes, (unsigned long)(osb->s_clustersize
-                                  - (wc->w_pos & (osb->s_clustersize - 1))));
-       }
-       to = from + bytes;
+       /*
+        * This is true if page_size > cluster_size.
+        *
+        * It triggers a set of special cases during write which might
+        * have to deal with allocating writes to partial pages.
+        */
+       unsigned int                    w_large_pages;
+
+       /*
+        * Pages involved in this write.
+        *
+        * w_target_page is the page being written to by the user.
+        *
+        * w_pages is an array of pages which always contains
+        * w_target_page, and in the case of an allocating write with
+        * page_size < cluster size, it will contain zero'd and mapped
+        * pages adjacent to w_target_page which need to be written
+        * out so that future reads from that region will get
+        * zero's.
+        */
+       struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
+       unsigned int                    w_num_pages;
+       struct page                     *w_target_page;
 
-       BUG_ON(from > PAGE_CACHE_SIZE);
-       BUG_ON(to > PAGE_CACHE_SIZE);
-       BUG_ON(from < cluster_start);
-       BUG_ON(to > cluster_end);
+       /*
+        * ocfs2_write_end() uses this to know what the real range to
+        * write in the target should be.
+        */
+       unsigned int                    w_target_from;
+       unsigned int                    w_target_to;
 
-       if (wc->w_this_page_new)
-               ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-                                           cluster_start, cluster_end, 1);
-       else
-               ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-                                           from, to, 0);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
+       /*
+        * We could use journal_current_handle() but this is cleaner,
+        * IMHO -Mark
+        */
+       handle_t                        *w_handle;
+
+       struct buffer_head              *w_di_bh;
+
+       struct ocfs2_cached_dealloc_ctxt w_dealloc;
+};
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+       int i;
+
+       for(i = 0; i < wc->w_num_pages; i++) {
+               if (wc->w_pages[i] == NULL)
+                       continue;
+
+               unlock_page(wc->w_pages[i]);
+               mark_page_accessed(wc->w_pages[i]);
+               page_cache_release(wc->w_pages[i]);
        }
 
-       src = buf->ops->map(sp->s_pipe, buf, 1);
-       dst = kmap_atomic(wc->w_this_page, KM_USER1);
-       memcpy(dst + from, src + src_from, bytes);
-       kunmap_atomic(wc->w_this_page, KM_USER1);
-       buf->ops->unmap(sp->s_pipe, buf, src);
+       brelse(wc->w_di_bh);
+       kfree(wc);
+}
+
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+                                 struct ocfs2_super *osb, loff_t pos,
+                                 unsigned len, struct buffer_head *di_bh)
+{
+       struct ocfs2_write_ctxt *wc;
+
+       wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+       if (!wc)
+               return -ENOMEM;
 
-       wc->w_finished_copy = 1;
+       wc->w_cpos = pos >> osb->s_clustersize_bits;
+       wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+       get_bh(di_bh);
+       wc->w_di_bh = di_bh;
 
-       *ret_from = from;
-       *ret_to = to;
-out:
+       if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+               wc->w_large_pages = 1;
+       else
+               wc->w_large_pages = 0;
+
+       ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
+       *wcp = wc;
 
-       return bytes ? (unsigned int)bytes : ret;
+       return 0;
 }
 
 /*
- * This will copy user data from the iovec in the buffered write
- * context.
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
  */
-int ocfs2_map_and_write_user_data(struct inode *inode,
-                                 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-                                 unsigned int *ret_from, unsigned int *ret_to)
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 {
-       int ret;
-       unsigned int to, from, cluster_start, cluster_end;
-       unsigned long bytes, src_from;
-       char *dst;
-       struct ocfs2_buffered_write_priv *bp = wc->w_private;
-       const struct iovec *cur_iov = bp->b_cur_iov;
-       char __user *buf;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       unsigned int block_start, block_end;
+       struct buffer_head *head, *bh;
 
-       ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-                                       &cluster_end);
+       BUG_ON(!PageLocked(page));
+       if (!page_has_buffers(page))
+               return;
 
-       buf = cur_iov->iov_base + bp->b_cur_off;
-       src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+       bh = head = page_buffers(page);
+       block_start = 0;
+       do {
+               block_end = block_start + bh->b_size;
 
-       from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+               if (buffer_new(bh)) {
+                       if (block_end > from && block_start < to) {
+                               if (!PageUptodate(page)) {
+                                       unsigned start, end;
 
-       /*
-        * This is a lot of comparisons, but it reads quite
-        * easily, which is important here.
-        */
-       /* Stay within the src page */
-       bytes = PAGE_SIZE - src_from;
-       /* Stay within the vector */
-       bytes = min(bytes,
-                   (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
-       /* Stay within count */
-       bytes = min(bytes, (unsigned long)wc->w_count);
-       /*
-        * For clustersize > page size, just stay within
-        * target page, otherwise we have to calculate pos
-        * within the cluster and obey the rightmost
-        * boundary.
-        */
-       if (wc->w_large_pages) {
-               /*
-                * For cluster size < page size, we have to
-                * calculate pos within the cluster and obey
-                * the rightmost boundary.
-                */
-               bytes = min(bytes, (unsigned long)(osb->s_clustersize
-                                  - (wc->w_pos & (osb->s_clustersize - 1))));
-       } else {
-               /*
-                * cluster size > page size is the most common
-                * case - we just stay within the target page
-                * boundary.
-                */
-               bytes = min(bytes, PAGE_CACHE_SIZE - from);
-       }
+                                       start = max(from, block_start);
+                                       end = min(to, block_end);
 
-       to = from + bytes;
+                                       zero_user_page(page, start, end - start, KM_USER0);
+                                       set_buffer_uptodate(bh);
+                               }
 
-       BUG_ON(from > PAGE_CACHE_SIZE);
-       BUG_ON(to > PAGE_CACHE_SIZE);
-       BUG_ON(from < cluster_start);
-       BUG_ON(to > cluster_end);
+                               clear_buffer_new(bh);
+                               mark_buffer_dirty(bh);
+                       }
+               }
 
-       if (wc->w_this_page_new)
-               ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-                                           cluster_start, cluster_end, 1);
-       else
-               ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-                                           from, to, 0);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
+               block_start = block_end;
+               bh = bh->b_this_page;
+       } while (bh != head);
+}
 
-       dst = kmap(wc->w_this_page);
-       memcpy(dst + from, bp->b_src_buf + src_from, bytes);
-       kunmap(wc->w_this_page);
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+                               struct ocfs2_write_ctxt *wc,
+                               loff_t user_pos, unsigned user_len)
+{
+       int i;
+       unsigned from, to;
+       struct page *tmppage;
 
-       /*
-        * XXX: This is slow, but simple. The caller of
-        * ocfs2_buffered_write_cluster() is responsible for
-        * passing through the iovecs, so it's difficult to
-        * predict what our next step is in here after our
-        * initial write. A future version should be pushing
-        * that iovec manipulation further down.
-        *
-        * By setting this, we indicate that a copy from user
-        * data was done, and subsequent calls for this
-        * cluster will skip copying more data.
-        */
-       wc->w_finished_copy = 1;
+       ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
 
-       *ret_from = from;
-       *ret_to = to;
-out:
+       if (wc->w_large_pages) {
+               from = wc->w_target_from;
+               to = wc->w_target_to;
+       } else {
+               from = 0;
+               to = PAGE_CACHE_SIZE;
+       }
+
+       for(i = 0; i < wc->w_num_pages; i++) {
+               tmppage = wc->w_pages[i];
 
-       return bytes ? (unsigned int)bytes : ret;
+               if (ocfs2_should_order_data(inode))
+                       walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+                                         from, to, NULL,
+                                         ocfs2_journal_dirty_data);
+
+               block_commit_write(tmppage, from, to);
+       }
 }
 
-/*
- * Map, fill and write a page to disk.
- *
- * The work of copying data is done via callback.  Newly allocated
- * pages which don't take user data will be zero'd (set 'new' to
- * indicate an allocating write)
- *
- * Returns a negative error code or the number of bytes copied into
- * the page.
- */
-static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
-                                u64 *p_blkno, struct page *page,
-                                struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+                                       struct ocfs2_write_ctxt *wc,
+                                       struct page *page, u32 cpos,
+                                       loff_t user_pos, unsigned user_len,
+                                       int new)
 {
-       int ret, copied = 0;
-       unsigned int from = 0, to = 0;
+       int ret;
+       unsigned int map_from = 0, map_to = 0;
        unsigned int cluster_start, cluster_end;
-       unsigned int zero_from = 0, zero_to = 0;
+       unsigned int user_data_from = 0, user_data_to = 0;
 
-       ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+       ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
                                        &cluster_start, &cluster_end);
 
-       if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
-           && !wc->w_finished_copy) {
-
-               wc->w_this_page = page;
-               wc->w_this_page_new = new;
-               ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
-               if (ret < 0) {
+       if (page == wc->w_target_page) {
+               map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+               map_to = map_from + user_len;
+
+               if (new)
+                       ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+                                                   cluster_start, cluster_end,
+                                                   new);
+               else
+                       ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+                                                   map_from, map_to, new);
+               if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
 
-               copied = ret;
-
-               zero_from = from;
-               zero_to = to;
+               user_data_from = map_from;
+               user_data_to = map_to;
                if (new) {
-                       from = cluster_start;
-                       to = cluster_end;
+                       map_from = cluster_start;
+                       map_to = cluster_end;
                }
+
+               wc->w_target_from = map_from;
+               wc->w_target_to = map_to;
        } else {
                /*
                 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
                 */
                BUG_ON(!new);
 
-               from = cluster_start;
-               to = cluster_end;
+               map_from = cluster_start;
+               map_to = cluster_end;
 
                ret = ocfs2_map_page_blocks(page, p_blkno, inode,
-                                           cluster_start, cluster_end, 1);
+                                           cluster_start, cluster_end, new);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
         */
        if (new && !PageUptodate(page))
                ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
-                                        wc->w_cpos, zero_from, zero_to);
+                                        cpos, user_data_from, user_data_to);
 
        flush_dcache_page(page);
 
-       if (ocfs2_should_order_data(inode)) {
-               ret = walk_page_buffers(handle,
-                                       page_buffers(page),
-                                       from, to, NULL,
-                                       ocfs2_journal_dirty_data);
-               if (ret < 0)
-                       mlog_errno(ret);
-       }
-
-       /*
-        * We don't use generic_commit_write() because we need to
-        * handle our own i_size update.
-        */
-       ret = block_commit_write(page, from, to);
-       if (ret)
-               mlog_errno(ret);
 out:
-
-       return copied ? copied : ret;
+       return ret;
 }
 
 /*
- * Do the actual write of some data into an inode. Optionally allocate
- * in order to fulfill the write.
- *
- * cpos is the logical cluster offset within the file to write at
- *
- * 'phys' is the physical mapping of that offset. a 'phys' value of
- * zero indicates that allocation is required. In this case, data_ac
- * and meta_ac should be valid (meta_ac can be null if metadata
- * allocation isn't required).
+ * This function will only grab one clusters worth of pages.
  */
-static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
-                          struct buffer_head *di_bh,
-                          struct ocfs2_alloc_context *data_ac,
-                          struct ocfs2_alloc_context *meta_ac,
-                          struct ocfs2_write_ctxt *wc)
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+                                     struct ocfs2_write_ctxt *wc,
+                                     u32 cpos, loff_t user_pos, int new,
+                                     struct page *mmap_page)
 {
-       int ret, i, numpages = 1, new;
-       unsigned int copied = 0;
-       u32 tmp_pos;
-       u64 v_blkno, p_blkno;
-       struct address_space *mapping = file->f_mapping;
+       int ret = 0, i;
+       unsigned long start, target_index, index;
        struct inode *inode = mapping->host;
-       unsigned long index, start;
-       struct page **cpages;
 
-       new = phys == 0 ? 1 : 0;
+       target_index = user_pos >> PAGE_CACHE_SHIFT;
 
        /*
         * Figure out how many pages we'll be manipulating here. For
         * non allocating write, we just change the one
         * page. Otherwise, we'll need a whole clusters worth.
         */
-       if (new)
-               numpages = ocfs2_pages_per_cluster(inode->i_sb);
-
-       cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
-       if (!cpages) {
-               ret = -ENOMEM;
-               mlog_errno(ret);
-               return ret;
-       }
-
-       /*
-        * Fill our page array first. That way we've grabbed enough so
-        * that we can zero and flush if we error after adding the
-        * extent.
-        */
        if (new) {
-               start = ocfs2_align_clusters_to_page_index(inode->i_sb,
-                                                          wc->w_cpos);
-               v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+               wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+               start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
        } else {
-               start = wc->w_pos >> PAGE_CACHE_SHIFT;
-               v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+               wc->w_num_pages = 1;
+               start = target_index;
        }
 
-       for(i = 0; i < numpages; i++) {
+       for(i = 0; i < wc->w_num_pages; i++) {
                index = start + i;
 
-               cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
-               if (!cpages[i]) {
-                       ret = -ENOMEM;
-                       mlog_errno(ret);
-                       goto out;
+               if (index == target_index && mmap_page) {
+                       /*
+                        * ocfs2_pagemkwrite() is a little different
+                        * and wants us to directly use the page
+                        * passed in.
+                        */
+                       lock_page(mmap_page);
+
+                       if (mmap_page->mapping != mapping) {
+                               unlock_page(mmap_page);
+                               /*
+                                * Sanity check - the locking in
+                                * ocfs2_pagemkwrite() should ensure
+                                * that this code doesn't trigger.
+                                */
+                               ret = -EINVAL;
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       page_cache_get(mmap_page);
+                       wc->w_pages[i] = mmap_page;
+               } else {
+                       wc->w_pages[i] = find_or_create_page(mapping, index,
+                                                            GFP_NOFS);
+                       if (!wc->w_pages[i]) {
+                               ret = -ENOMEM;
+                               mlog_errno(ret);
+                               goto out;
+                       }
                }
+
+               if (index == target_index)
+                       wc->w_target_page = wc->w_pages[i];
        }
+out:
+       return ret;
+}
+
+/*
+ * Prepare a single cluster for writing into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+                              u32 phys, unsigned int unwritten,
+                              struct ocfs2_alloc_context *data_ac,
+                              struct ocfs2_alloc_context *meta_ac,
+                              struct ocfs2_write_ctxt *wc, u32 cpos,
+                              loff_t user_pos, unsigned user_len)
+{
+       int ret, i, new, should_zero = 0;
+       u64 v_blkno, p_blkno;
+       struct inode *inode = mapping->host;
+
+       new = phys == 0 ? 1 : 0;
+       if (new || unwritten)
+               should_zero = 1;
 
        if (new) {
+               u32 tmp_pos;
+
                /*
                 * This is safe to call with the page locks - it won't take
                 * any additional semaphores or cluster locks.
                 */
-               tmp_pos = wc->w_cpos;
+               tmp_pos = cpos;
                ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-                                                &tmp_pos, 1, di_bh, handle,
-                                                data_ac, meta_ac, NULL);
+                                                &tmp_pos, 1, 0, wc->w_di_bh,
+                                                wc->w_handle, data_ac,
+                                                meta_ac, NULL);
                /*
                 * This shouldn't happen because we must have already
                 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
                        mlog_errno(ret);
                        goto out;
                }
+       } else if (unwritten) {
+               ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+                                               wc->w_handle, cpos, 1, phys,
+                                               meta_ac, &wc->w_dealloc);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
        }
 
+       if (should_zero)
+               v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+       else
+               v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+
+       /*
+        * The only reason this should fail is due to an inability to
+        * find the extent added.
+        */
        ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
                                          NULL);
        if (ret < 0) {
-
-               /*
-                * XXX: Should we go readonly here?
-                */
-
-               mlog_errno(ret);
+               ocfs2_error(inode->i_sb, "Corrupting extent for inode %llu, "
+                           "at logical block %llu",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                           (unsigned long long)v_blkno);
                goto out;
        }
 
        BUG_ON(p_blkno == 0);
 
-       for(i = 0; i < numpages; i++) {
-               ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
-                                           wc, new);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
+       for(i = 0; i < wc->w_num_pages; i++) {
+               int tmpret;
+
+               tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+                                                     wc->w_pages[i], cpos,
+                                                     user_pos, user_len,
+                                                     should_zero);
+               if (tmpret) {
+                       mlog_errno(tmpret);
+                       if (ret == 0)
+                               ret = tmpret;
+               }
-
-               copied += ret;
        }
 
+       /*
+        * We only have cleanup to do in case of allocating write.
+        */
+       if (ret && new)
+               ocfs2_write_failure(inode, wc, user_pos, user_len);
+
 out:
-       for(i = 0; i < numpages; i++) {
-               unlock_page(cpages[i]);
-               mark_page_accessed(cpages[i]);
-               page_cache_release(cpages[i]);
+
+       return ret;
+}
+
+static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
+                                      struct ocfs2_alloc_context *data_ac,
+                                      struct ocfs2_alloc_context *meta_ac,
+                                      struct ocfs2_write_ctxt *wc,
+                                      loff_t pos, unsigned len)
+{
+       int ret, i;
+       struct ocfs2_write_cluster_desc *desc;
+
+       for (i = 0; i < wc->w_clen; i++) {
+               desc = &wc->w_desc[i];
+
+               ret = ocfs2_write_cluster(mapping, desc->c_phys,
+                                         desc->c_unwritten, data_ac, meta_ac,
+                                         wc, desc->c_cpos, pos, len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
        }
-       kfree(cpages);
 
-       return copied ? copied : ret;
+       ret = 0;
+out:
+       return ret;
 }
 
-static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
-                                 struct ocfs2_super *osb, loff_t pos,
-                                 size_t count, ocfs2_page_writer *cb,
-                                 void *cb_priv)
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+                                       struct ocfs2_write_ctxt *wc,
+                                       loff_t pos, unsigned len, int alloc)
 {
-       wc->w_count = count;
-       wc->w_pos = pos;
-       wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
-       wc->w_finished_copy = 0;
+       struct ocfs2_write_cluster_desc *desc;
 
-       if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
-               wc->w_large_pages = 1;
-       else
-               wc->w_large_pages = 0;
+       wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+       wc->w_target_to = wc->w_target_from + len;
 
-       wc->w_write_data_page = cb;
-       wc->w_private = cb_priv;
+       if (alloc == 0)
+               return;
+
+       /*
+        * Allocating write - we may have different boundaries based
+        * on page size and cluster size.
+        *
+        * NOTE: We can no longer compute one value from the other as
+        * the actual write length and user provided length may be
+        * different.
+        */
+
+       if (wc->w_large_pages) {
+               /*
+                * We only care about the 1st and last cluster within
+                * our range and whether they should be zero'd or not. Either
+                * value may be extended out to the start/end of a
+                * newly allocated cluster.
+                */
+               desc = &wc->w_desc[0];
+               if (ocfs2_should_zero_cluster(desc))
+                       ocfs2_figure_cluster_boundaries(osb,
+                                                       desc->c_cpos,
+                                                       &wc->w_target_from,
+                                                       NULL);
+
+               desc = &wc->w_desc[wc->w_clen - 1];
+               if (ocfs2_should_zero_cluster(desc))
+                       ocfs2_figure_cluster_boundaries(osb,
+                                                       desc->c_cpos,
+                                                       NULL,
+                                                       &wc->w_target_to);
+       } else {
+               wc->w_target_from = 0;
+               wc->w_target_to = PAGE_CACHE_SIZE;
+       }
 }
 
 /*
- * Write a cluster to an inode. The cluster may not be allocated yet,
- * in which case it will be. This only exists for buffered writes -
- * O_DIRECT takes a more "traditional" path through the kernel.
- *
- * The caller is responsible for incrementing pos, written counts, etc
+ * Populate each single-cluster write descriptor in the write context
+ * with information about the i/o to be done.
  *
- * For file systems that don't support sparse files, pre-allocation
- * and page zeroing up until cpos should be done prior to this
- * function call.
- *
- * Callers should be holding i_sem, and the rw cluster lock.
- *
- * Returns the number of user bytes written, or less than zero for
- * error.
+ * Returns the number of clusters that will have to be allocated, as
+ * well as a worst case estimate of the number of extent records that
+ * would have to be created during a write to an unwritten region.
  */
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
-                                    size_t count, ocfs2_page_writer *actor,
-                                    void *priv)
+static int ocfs2_populate_write_desc(struct inode *inode,
+                                    struct ocfs2_write_ctxt *wc,
+                                    unsigned int *clusters_to_alloc,
+                                    unsigned int *extents_to_split)
+{
+       int ret;
+       struct ocfs2_write_cluster_desc *desc;
+       unsigned int num_clusters = 0;
+       unsigned int ext_flags = 0;
+       u32 phys = 0;
+       int i;
+
+       *clusters_to_alloc = 0;
+       *extents_to_split = 0;
+
+       for (i = 0; i < wc->w_clen; i++) {
+               desc = &wc->w_desc[i];
+               desc->c_cpos = wc->w_cpos + i;
+
+               if (num_clusters == 0) {
+                       /*
+                        * Need to look up the next extent record.
+                        */
+                       ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+                                                &num_clusters, &ext_flags);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       /*
+                        * Assume worst case - that we're writing in
+                        * the middle of the extent.
+                        *
+                        * We can assume that the write proceeds from
+                        * left to right, in which case the extent
+                        * insert code is smart enough to coalesce the
+                        * next splits into the previous records created.
+                        */
+                       if (ext_flags & OCFS2_EXT_UNWRITTEN)
+                               *extents_to_split = *extents_to_split + 2;
+               } else if (phys) {
+                       /*
+                        * Only increment phys if it doesn't describe
+                        * a hole.
+                        */
+                       phys++;
+               }
+
+               desc->c_phys = phys;
+               if (phys == 0) {
+                       desc->c_new = 1;
+                       *clusters_to_alloc = *clusters_to_alloc + 1;
+               }
+               if (ext_flags & OCFS2_EXT_UNWRITTEN)
+                       desc->c_unwritten = 1;
+
+               num_clusters--;
+       }
+
+       ret = 0;
+out:
+       return ret;
+}
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, unsigned flags,
+                            struct page **pagep, void **fsdata,
+                            struct buffer_head *di_bh, struct page *mmap_page)
 {
        int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
-       ssize_t written = 0;
-       u32 phys;
-       struct inode *inode = file->f_mapping->host;
+       unsigned int clusters_to_alloc, extents_to_split;
+       struct ocfs2_write_ctxt *wc;
+       struct inode *inode = mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       struct buffer_head *di_bh = NULL;
        struct ocfs2_dinode *di;
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_alloc_context *meta_ac = NULL;
        handle_t *handle;
-       struct ocfs2_write_ctxt wc;
-
-       ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
 
-       ret = ocfs2_meta_lock(inode, &di_bh, 1);
+       ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
        if (ret) {
                mlog_errno(ret);
-               goto out;
+               return ret;
        }
-       di = (struct ocfs2_dinode *)di_bh->b_data;
-
-       /*
-        * Take alloc sem here to prevent concurrent lookups. That way
-        * the mapping, zeroing and tree manipulation within
-        * ocfs2_write() will be safe against ->readpage(). This
-        * should also serve to lock out allocation from a shared
-        * writeable region.
-        */
-       down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-       ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+       ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+                                       &extents_to_split);
        if (ret) {
                mlog_errno(ret);
-               goto out_meta;
+               goto out;
        }
 
-       /* phys == 0 means that allocation is required. */
-       if (phys == 0) {
-               ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+       di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+       /*
+        * We set w_target_from, w_target_to here so that
+        * ocfs2_write_end() knows which range in the target page to
+        * write out. An allocation requires that we write the entire
+        * cluster range.
+        */
+       if (clusters_to_alloc || extents_to_split) {
+               /*
+                * XXX: We are stretching the limits of
+                * ocfs2_lock_allocators(). It greatly over-estimates
+                * the work to be done.
+                */
+               ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
+                                           extents_to_split, &data_ac, &meta_ac);
                if (ret) {
                        mlog_errno(ret);
-                       goto out_meta;
+                       goto out;
                }
 
-               credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
-       }
+               credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+                                                   clusters_to_alloc);
 
-       ret = ocfs2_data_lock(inode, 1);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_meta;
        }
 
+       ocfs2_set_target_boundaries(osb, wc, pos, len,
+                                   clusters_to_alloc + extents_to_split);
+
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
-               goto out_data;
+               goto out;
        }
 
-       written = ocfs2_write(file, phys, handle, di_bh, data_ac,
-                             meta_ac, &wc);
-       if (written < 0) {
-               ret = written;
+       wc->w_handle = handle;
+
+       /*
+        * We don't want this to fail in ocfs2_write_end(), so do it
+        * here.
+        */
+       ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }
 
-       ret = ocfs2_journal_access(handle, inode, di_bh,
-                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       /*
+        * Fill our page array first. That way we've grabbed enough so
+        * that we can zero and flush if we error after adding the
+        * extent.
+        */
+       ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+                                        clusters_to_alloc + extents_to_split,
+                                        mmap_page);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }
 
-       pos += written;
+       ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
+                                         len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+
+       *pagep = wc->w_target_page;
+       *fsdata = wc;
+       return 0;
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+
+out:
+       ocfs2_free_write_ctxt(wc);
+
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+       return ret;
+}
+
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+                     loff_t pos, unsigned len, unsigned flags,
+                     struct page **pagep, void **fsdata)
+{
+       int ret;
+       struct buffer_head *di_bh = NULL;
+       struct inode *inode = mapping->host;
+
+       ret = ocfs2_meta_lock(inode, &di_bh, 1);
+       if (ret) {
+               mlog_errno(ret);
+               return ret;
+       }
+
+       /*
+        * Take alloc sem here to prevent concurrent lookups. That way
+        * the mapping, zeroing and tree manipulation within
+        * ocfs2_write_begin_nolock() will be safe against ->readpage(). This
+        * should also serve to lock out allocation from a shared
+        * writeable region.
+        */
+       down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = ocfs2_data_lock(inode, 1);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_fail;
+       }
+
+       ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+                                      fsdata, di_bh, NULL);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_fail_data;
+       }
+
+       brelse(di_bh);
+
+       return 0;
+
+out_fail_data:
+       ocfs2_data_unlock(inode, 1);
+out_fail:
+       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       brelse(di_bh);
+       ocfs2_meta_unlock(inode, 1);
+
+       return ret;
+}
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata)
+{
+       int i;
+       unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+       struct inode *inode = mapping->host;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_write_ctxt *wc = fsdata;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+       handle_t *handle = wc->w_handle;
+       struct page *tmppage;
+
+       if (unlikely(copied < len)) {
+               if (!PageUptodate(wc->w_target_page))
+                       copied = 0;
+
+               ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+                                      start+len);
+       }
+       flush_dcache_page(wc->w_target_page);
+
+       for(i = 0; i < wc->w_num_pages; i++) {
+               tmppage = wc->w_pages[i];
+
+               if (tmppage == wc->w_target_page) {
+                       from = wc->w_target_from;
+                       to = wc->w_target_to;
+
+                       BUG_ON(from > PAGE_CACHE_SIZE ||
+                              to > PAGE_CACHE_SIZE ||
+                              to < from);
+               } else {
+                       /*
+                        * Pages adjacent to the target (if any) imply
+                        * a hole-filling write in which case we want
+                        * to flush their entire range.
+                        */
+                       from = 0;
+                       to = PAGE_CACHE_SIZE;
+               }
+
+               if (ocfs2_should_order_data(inode))
+                       walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+                                         from, to, NULL,
+                                         ocfs2_journal_dirty_data);
+
+               block_commit_write(tmppage, from, to);
+       }
+
+       pos += copied;
        if (pos > inode->i_size) {
                i_size_write(inode, pos);
                mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
        di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+       ocfs2_journal_dirty(handle, wc->w_di_bh);
 
-       ret = ocfs2_journal_dirty(handle, di_bh);
-       if (ret)
-               mlog_errno(ret);
-
-out_commit:
        ocfs2_commit_trans(osb, handle);
 
-out_data:
-       ocfs2_data_unlock(inode, 1);
+       ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
+       ocfs2_free_write_ctxt(wc);
+
+       return copied;
+}
+
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+                   loff_t pos, unsigned len, unsigned copied,
+                   struct page *page, void *fsdata)
+{
+       int ret;
+       struct inode *inode = mapping->host;
 
-out_meta:
+       ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
+       ocfs2_data_unlock(inode, 1);
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        ocfs2_meta_unlock(inode, 1);
 
-out:
-       brelse(di_bh);
-       if (data_ac)
-               ocfs2_free_alloc_context(data_ac);
-       if (meta_ac)
-               ocfs2_free_alloc_context(meta_ac);
-
-       return written ? written : ret;
+       return ret;
 }
 
 const struct address_space_operations ocfs2_aops = {
index 45821d479b5a3662028d99847dc64835f39b92fb..389579bd64e372e8294858d63d5b0d913fee648a 100644 (file)
@@ -42,57 +42,22 @@ int walk_page_buffers(      handle_t *handle,
                        int (*fn)(      handle_t *handle,
                                        struct buffer_head *bh));
 
-struct ocfs2_write_ctxt;
-typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
-                               u64 *, unsigned int *, unsigned int *);
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+                     loff_t pos, unsigned len, unsigned flags,
+                     struct page **pagep, void **fsdata);
 
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
-                                    size_t count, ocfs2_page_writer *actor,
-                                    void *priv);
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+                   loff_t pos, unsigned len, unsigned copied,
+                   struct page *page, void *fsdata);
 
-struct ocfs2_write_ctxt {
-       size_t                          w_count;
-       loff_t                          w_pos;
-       u32                             w_cpos;
-       unsigned int                    w_finished_copy;
+int ocfs2_write_end_nolock(struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
 
-       /* This is true if page_size > cluster_size */
-       unsigned int                    w_large_pages;
-
-       /* Filler callback and private data */
-       ocfs2_page_writer               *w_write_data_page;
-       void                            *w_private;
-
-       /* Only valid for the filler callback */
-       struct page                     *w_this_page;
-       unsigned int                    w_this_page_new;
-};
-
-struct ocfs2_buffered_write_priv {
-       char                            *b_src_buf;
-       const struct iovec              *b_cur_iov; /* Current iovec */
-       size_t                          b_cur_off; /* Offset in the
-                                                   * current iovec */
-};
-int ocfs2_map_and_write_user_data(struct inode *inode,
-                                 struct ocfs2_write_ctxt *wc,
-                                 u64 *p_blkno,
-                                 unsigned int *ret_from,
-                                 unsigned int *ret_to);
-
-struct ocfs2_splice_write_priv {
-       struct splice_desc              *s_sd;
-       struct pipe_buffer              *s_buf;
-       struct pipe_inode_info          *s_pipe;
-       /* Neither offset value is ever larger than one page */
-       unsigned int                    s_offset;
-       unsigned int                    s_buf_offset;
-};
-int ocfs2_map_and_write_splice_data(struct inode *inode,
-                                   struct ocfs2_write_ctxt *wc,
-                                   u64 *p_blkno,
-                                   unsigned int *ret_from,
-                                   unsigned int *ret_to);
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, unsigned flags,
+                            struct page **pagep, void **fsdata,
+                            struct buffer_head *di_bh, struct page *mmap_page);
 
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
index 979113479c664a2a8ea01bab4fa20ff3e8e0dadb..2bd7f788cf34a73f9ca59694f6f8103e943733e3 100644 (file)
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
        ret = wait_event_interruptible(o2hb_steady_queue,
                                atomic_read(&reg->hr_steady_iterations) == 0);
        if (ret) {
+               /* We got interrupted (hello ptrace!).  Clean up */
                spin_lock(&o2hb_live_lock);
                hb_task = reg->hr_task;
                reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
                goto out;
        }
 
-       ret = count;
+       /* Ok, we were woken.  Make sure it wasn't by drop_item() */
+       spin_lock(&o2hb_live_lock);
+       hb_task = reg->hr_task;
+       spin_unlock(&o2hb_live_lock);
+
+       if (hb_task)
+               ret = count;
+       else
+               ret = -EIO;
+
 out:
        if (filp)
                fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
        if (hb_task)
                kthread_stop(hb_task);
 
+       /*
+        * If we're racing a dev_write(), we need to wake them.  They will
+        * check reg->hr_task
+        */
+       if (atomic_read(&reg->hr_steady_iterations) != 0) {
+               atomic_set(&reg->hr_steady_iterations, 0);
+               wake_up(&o2hb_steady_queue);
+       }
+
        config_item_put(item);
 }
 
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 }
 EXPORT_SYMBOL_GPL(o2hb_setup_callback);
 
-int o2hb_register_callback(struct o2hb_callback_func *hc)
+static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+{
+       struct o2hb_region *p, *reg = NULL;
+
+       assert_spin_locked(&o2hb_live_lock);
+
+       list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
+               if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
+                       reg = p;
+                       break;
+               }
+       }
+
+       return reg;
+}
+
+static int o2hb_region_get(const char *region_uuid)
+{
+       int ret = 0;
+       struct o2hb_region *reg;
+
+       spin_lock(&o2hb_live_lock);
+
+       reg = o2hb_find_region(region_uuid);
+       if (!reg)
+               ret = -ENOENT;
+       spin_unlock(&o2hb_live_lock);
+
+       if (ret)
+               goto out;
+
+       ret = o2nm_depend_this_node();
+       if (ret)
+               goto out;
+
+       ret = o2nm_depend_item(&reg->hr_item);
+       if (ret)
+               o2nm_undepend_this_node();
+
+out:
+       return ret;
+}
+
+static void o2hb_region_put(const char *region_uuid)
+{
+       struct o2hb_region *reg;
+
+       spin_lock(&o2hb_live_lock);
+
+       reg = o2hb_find_region(region_uuid);
+
+       spin_unlock(&o2hb_live_lock);
+
+       if (reg) {
+               o2nm_undepend_item(&reg->hr_item);
+               o2nm_undepend_this_node();
+       }
+}
+
+int o2hb_register_callback(const char *region_uuid,
+                          struct o2hb_callback_func *hc)
 {
        struct o2hb_callback_func *tmp;
        struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
                goto out;
        }
 
+       if (region_uuid) {
+               ret = o2hb_region_get(region_uuid);
+               if (ret)
+                       goto out;
+       }
+
        down_write(&o2hb_callback_sem);
 
        list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(o2hb_register_callback);
 
-void o2hb_unregister_callback(struct o2hb_callback_func *hc)
+void o2hb_unregister_callback(const char *region_uuid,
+                             struct o2hb_callback_func *hc)
 {
        BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
 
        mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
             __builtin_return_address(0), hc);
 
+       /* XXX Can this happen _with_ a region reference? */
        if (list_empty(&hc->hc_item))
                return;
 
+       if (region_uuid)
+               o2hb_region_put(region_uuid);
+
        down_write(&o2hb_callback_sem);
 
        list_del_init(&hc->hc_item);
index cc6d40b397715bda85d68643f90a3cb267f47580..35397dd5ecdbb42c3922cbc9e930081c4c522b87 100644 (file)
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
                         o2hb_cb_func *func,
                         void *data,
                         int priority);
-int o2hb_register_callback(struct o2hb_callback_func *hc);
-void o2hb_unregister_callback(struct o2hb_callback_func *hc);
+int o2hb_register_callback(const char *region_uuid,
+                          struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+                             struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
                        unsigned bytes);
 void o2hb_init(void);
index 9f5ad0f01ce0d885e787003d07c4fa0907139dc0..af2070da308b87904c468988ba94cae0ee7cf8ae 100644 (file)
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
        },
 };
 
+int o2nm_depend_item(struct config_item *item)
+{
+       return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+void o2nm_undepend_item(struct config_item *item)
+{
+       configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+       int ret = 0;
+       struct o2nm_node *local_node;
+
+       local_node = o2nm_get_node_by_num(o2nm_this_node());
+       if (!local_node) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = o2nm_depend_item(&local_node->nd_item);
+       o2nm_node_put(local_node);
+
+out:
+       return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+       struct o2nm_node *local_node;
+
+       local_node = o2nm_get_node_by_num(o2nm_this_node());
+       BUG_ON(!local_node);
+
+       o2nm_undepend_item(&local_node->nd_item);
+       o2nm_node_put(local_node);
+}
+
+
 static void __exit exit_o2nm(void)
 {
        if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
                goto out_sysctl;
 
        config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
-       init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+       mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
        ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
        if (ret) {
                printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
index 070522138ae26c1b10f0f50683a48b5c53f5bf8e..7c860361b8ddc944a84c7b05ddf760df99b6cdd2 100644 (file)
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
 void o2nm_node_get(struct o2nm_node *node);
 void o2nm_node_put(struct o2nm_node *node);
 
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
 #endif /* O2CLUSTER_NODEMANAGER_H */
index 0b229a9c7952612e2f30098c3085accbb8b167f9..f0bdfd944c44f53f35f7a11e00db26a2c16f87dc 100644 (file)
@@ -261,14 +261,12 @@ out:
 
 static void o2net_complete_nodes_nsw(struct o2net_node *nn)
 {
-       struct list_head *iter, *tmp;
+       struct o2net_status_wait *nsw, *tmp;
        unsigned int num_kills = 0;
-       struct o2net_status_wait *nsw;
 
        assert_spin_locked(&nn->nn_lock);
 
-       list_for_each_safe(iter, tmp, &nn->nn_status_list) {
-               nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+       list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
                o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
                num_kills++;
        }
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
 
 void o2net_unregister_handler_list(struct list_head *list)
 {
-       struct list_head *pos, *n;
-       struct o2net_msg_handler *nmh;
+       struct o2net_msg_handler *nmh, *n;
 
        write_lock(&o2net_handler_lock);
-       list_for_each_safe(pos, n, list) {
-               nmh = list_entry(pos, struct o2net_msg_handler,
-                                nh_unregister_item);
+       list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
                mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
                     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
                rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 void o2net_unregister_hb_callbacks(void)
 {
-       o2hb_unregister_callback(&o2net_hb_up);
-       o2hb_unregister_callback(&o2net_hb_down);
+       o2hb_unregister_callback(NULL, &o2net_hb_up);
+       o2hb_unregister_callback(NULL, &o2net_hb_down);
 }
 
 int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
        o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
                            o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
 
-       ret = o2hb_register_callback(&o2net_hb_up);
+       ret = o2hb_register_callback(NULL, &o2net_hb_up);
        if (ret == 0)
-               ret = o2hb_register_callback(&o2net_hb_down);
+               ret = o2hb_register_callback(NULL, &o2net_hb_down);
 
        if (ret)
                o2net_unregister_hb_callbacks();
index c441ef1f2badab7a2d8b2cc9cba5a0ac85ee9bb3..0d5fdde959c8018a2d244ae678d16fde1e5b3f33 100644 (file)
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
                u32 offset = OCFS2_I(dir)->ip_clusters;
 
                status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-                                                   1, parent_fe_bh, handle,
+                                                   1, 0, parent_fe_bh, handle,
                                                    data_ac, meta_ac, NULL);
                BUG_ON(status == -EAGAIN);
                if (status < 0) {
index d836b98dd99a42614ab0e1afbb4c5f459ef708ae..6954565b8ccb7cac2f895d79292987cb8d8a77ad 100644 (file)
@@ -1128,8 +1128,8 @@ bail:
 
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
 {
-       o2hb_unregister_callback(&dlm->dlm_hb_up);
-       o2hb_unregister_callback(&dlm->dlm_hb_down);
+       o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+       o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
        o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
 }
 
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
 
        o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
                            dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
-       status = o2hb_register_callback(&dlm->dlm_hb_down);
+       status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
        if (status)
                goto bail;
 
        o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
                            dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
-       status = o2hb_register_callback(&dlm->dlm_hb_up);
+       status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
        if (status)
                goto bail;
 
index 6edffca99d98981130e29a0fd7acc11d222a5ba6..65b2b9b9268854001da953a26720afbead6fc154 100644 (file)
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
 {
        struct dlm_master_list_entry *mle;
-       struct list_head *iter;
        
        mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
        spin_lock(&dlm->master_lock);
-       list_for_each(iter, &dlm->master_list) {
-               mle = list_entry(iter, struct dlm_master_list_entry, list);
+       list_for_each_entry(mle, &dlm->master_list, list)
                dlm_print_one_mle(mle);
-       }
        spin_unlock(&dlm->master_lock);
 }
 
 int dlm_dump_all_mles(const char __user *data, unsigned int len)
 {
-       struct list_head *iter;
        struct dlm_ctxt *dlm;
 
        spin_lock(&dlm_domain_lock);
-       list_for_each(iter, &dlm_domains) {
-               dlm = list_entry (iter, struct dlm_ctxt, list);
+       list_for_each_entry(dlm, &dlm_domains, list) {
                mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
                dlm_dump_mles(dlm);
        }
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
                        char *name, unsigned int namelen)
 {
        struct dlm_master_list_entry *tmpmle;
-       struct list_head *iter;
 
        assert_spin_locked(&dlm->master_lock);
 
-       list_for_each(iter, &dlm->master_list) {
-               tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+       list_for_each_entry(tmpmle, &dlm->master_list, list) {
                if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
                        continue;
                dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 {
        struct dlm_master_list_entry *mle;
-       struct list_head *iter;
 
        assert_spin_locked(&dlm->spinlock);
        
-       list_for_each(iter, &dlm->mle_hb_events) {
-               mle = list_entry(iter, struct dlm_master_list_entry, 
-                                hb_events);
+       list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
                if (node_up)
                        dlm_mle_node_up(dlm, mle, NULL, idx);
                else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        int ret;
        int i;
        int count = 0;
-       struct list_head *queue, *iter;
+       struct list_head *queue;
        struct dlm_lock *lock;
 
        assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        ret = 0;
        queue = &res->granted;
        for (i = 0; i < 3; i++) {
-               list_for_each(iter, queue) {
-                       lock = list_entry(iter, struct dlm_lock, list);
+               list_for_each_entry(lock, queue, list) {
                        ++count;
                        if (lock->ml.node == dlm->node_num) {
                                mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res)
 {
-       struct list_head *iter, *iter2;
        struct list_head *queue = &res->granted;
        int i, bit;
-       struct dlm_lock *lock;
+       struct dlm_lock *lock, *next;
 
        assert_spin_locked(&res->spinlock);
 
        BUG_ON(res->owner == dlm->node_num);
 
        for (i=0; i<3; i++) {
-               list_for_each_safe(iter, iter2, queue) {
-                       lock = list_entry (iter, struct dlm_lock, list);
+               list_for_each_entry_safe(lock, next, queue, list) {
                        if (lock->ml.node != dlm->node_num) {
                                mlog(0, "putting lock for node %u\n",
                                     lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 {
        int i;
        struct list_head *queue = &res->granted;
-       struct list_head *iter;
        struct dlm_lock *lock;
        int nodenum;
 
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 
        spin_lock(&res->spinlock);
        for (i=0; i<3; i++) {
-               list_for_each(iter, queue) {
+               list_for_each_entry(lock, queue, list) {
                        /* up to the caller to make sure this node
                         * is alive */
-                       lock = list_entry (iter, struct dlm_lock, list);
                        if (lock->ml.node != dlm->node_num) {
                                spin_unlock(&res->spinlock);
                                return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 
 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
 {
-       struct list_head *iter, *iter2;
-       struct dlm_master_list_entry *mle;
+       struct dlm_master_list_entry *mle, *next;
        struct dlm_lock_resource *res;
        unsigned int hash;
 
@@ -3245,9 +3229,7 @@ top:
 
        /* clean the master list */
        spin_lock(&dlm->master_lock);
-       list_for_each_safe(iter, iter2, &dlm->master_list) {
-               mle = list_entry(iter, struct dlm_master_list_entry, list);
-
+       list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
                BUG_ON(mle->type != DLM_MLE_BLOCK &&
                       mle->type != DLM_MLE_MASTER &&
                       mle->type != DLM_MLE_MIGRATION);
index 671c4ed58ee265dd97a5d72be35c52ff685c7d3b..a2c33160bfd6838dd17487b49492f6a85e5b3539 100644 (file)
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
        struct dlm_ctxt *dlm =
                container_of(work, struct dlm_ctxt, dispatched_work);
        LIST_HEAD(tmp_list);
-       struct list_head *iter, *iter2;
-       struct dlm_work_item *item;
+       struct dlm_work_item *item, *next;
        dlm_workfunc_t *workfunc;
        int tot=0;
 
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
        list_splice_init(&dlm->work_list, &tmp_list);
        spin_unlock(&dlm->work_lock);
 
-       list_for_each_safe(iter, iter2, &tmp_list) {
+       list_for_each_entry(item, &tmp_list, list) {
                tot++;
        }
        mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
 
-       list_for_each_safe(iter, iter2, &tmp_list) {
-               item = list_entry(iter, struct dlm_work_item, list);
+       list_for_each_entry_safe(item, next, &tmp_list, list) {
                workfunc = item->func;
                list_del_init(&item->list);
 
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 {
        int status = 0;
        struct dlm_reco_node_data *ndata;
-       struct list_head *iter;
        int all_nodes_done;
        int destroy = 0;
        int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
        /* safe to access the node data list without a lock, since this
         * process is the only one to change the list */
-       list_for_each(iter, &dlm->reco.node_data) {
-               ndata = list_entry (iter, struct dlm_reco_node_data, list);
+       list_for_each_entry(ndata, &dlm->reco.node_data, list) {
                BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
                ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
 
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                 * done, or if anyone died */
                all_nodes_done = 1;
                spin_lock(&dlm_reco_state_lock);
-               list_for_each(iter, &dlm->reco.node_data) {
-                       ndata = list_entry (iter, struct dlm_reco_node_data, list);
-
+               list_for_each_entry(ndata, &dlm->reco.node_data, list) {
                        mlog(0, "checking recovery state of node %u\n",
                             ndata->node_num);
                        switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 
 static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 {
-       struct list_head *iter, *iter2;
-       struct dlm_reco_node_data *ndata;
+       struct dlm_reco_node_data *ndata, *next;
        LIST_HEAD(tmplist);
 
        spin_lock(&dlm_reco_state_lock);
        list_splice_init(&dlm->reco.node_data, &tmplist);
        spin_unlock(&dlm_reco_state_lock);
 
-       list_for_each_safe(iter, iter2, &tmplist) {
-               ndata = list_entry (iter, struct dlm_reco_node_data, list);
+       list_for_each_entry_safe(ndata, next, &tmplist, list) {
                list_del_init(&ndata->list);
                kfree(ndata);
        }
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        struct dlm_lock_resource *res;
        struct dlm_ctxt *dlm;
        LIST_HEAD(resources);
-       struct list_head *iter;
        int ret;
        u8 dead_node, reco_master;
        int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 
        /* any errors returned will be due to the new_master dying,
         * the dlm_reco_thread should detect this */
-       list_for_each(iter, &resources) {
-               res = list_entry (iter, struct dlm_lock_resource, recovering);
+       list_for_each_entry(res, &resources, recovering) {
                ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
                                        DLM_MRES_RECOVERY);
                if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
-       struct list_head *iter;
        struct dlm_reco_node_data *ndata = NULL;
        int ret = -EINVAL;
 
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
                        dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
        spin_lock(&dlm_reco_state_lock);
-       list_for_each(iter, &dlm->reco.node_data) {
-               ndata = list_entry (iter, struct dlm_reco_node_data, list);
+       list_for_each_entry(ndata, &dlm->reco.node_data, list) {
                if (ndata->node_num != done->node_idx)
                        continue;
 
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
                                        struct list_head *list,
                                        u8 dead_node)
 {
-       struct dlm_lock_resource *res;
-       struct list_head *iter, *iter2;
+       struct dlm_lock_resource *res, *next;
        struct dlm_lock *lock;
 
        spin_lock(&dlm->spinlock);
-       list_for_each_safe(iter, iter2, &dlm->reco.resources) {
-               res = list_entry (iter, struct dlm_lock_resource, recovering);
+       list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
                /* always prune any $RECOVERY entries for dead nodes,
                 * otherwise hangs can occur during later recovery */
                if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
                                        u8 flags, u8 master)
 {
        /* mres here is one full page */
-       memset(mres, 0, PAGE_SIZE);
+       clear_page(mres);
        mres->lockname_len = namelen;
        memcpy(mres->lockname, lockname, namelen);
        mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                         struct dlm_migratable_lockres *mres,
                         u8 send_to, u8 flags)
 {
-       struct list_head *queue, *iter;
+       struct list_head *queue;
        int total_locks, i;
        u64 mig_cookie = 0;
        struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        total_locks = 0;
        for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
                queue = dlm_list_idx_to_ptr(res, i);
-               list_for_each(iter, queue) {
-                       lock = list_entry (iter, struct dlm_lock, list);
-
+               list_for_each_entry(lock, queue, list) {
                        /* add another lock. */
                        total_locks++;
                        if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
        struct dlm_lockstatus *lksb = NULL;
        int ret = 0;
        int i, j, bad;
-       struct list_head *iter;
        struct dlm_lock *lock = NULL;
        u8 from = O2NM_MAX_NODES;
        unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
                                tmpq = dlm_list_idx_to_ptr(res, j);
-                               list_for_each(iter, tmpq) {
-                                       lock = list_entry (iter, struct dlm_lock, list);