fs: Use rename lock and RCU for multi-step operations
authorNick Piggin <npiggin@kernel.dk>
Fri, 7 Jan 2011 06:49:37 +0000 (17:49 +1100)
committerNick Piggin <npiggin@kernel.dk>
Fri, 7 Jan 2011 06:50:22 +0000 (17:50 +1100)
The remaining usages for dcache_lock is to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree.
Also, to walk in the leaf->root direction in the tree where we don't have
a natural d_lock ordering.

This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.

Solve this instead by using the rename seqlock for multi-step read-side
operations, retry in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against.  Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
when dropping locks so also need to check and retry for that.

We can also use the rename lock in cases where livelock is a worry (and it
is introduced in subsequent patch).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
drivers/staging/pohmelfs/path_entry.c
fs/autofs4/waitq.c
fs/dcache.c
fs/nfs/namespace.c
include/linux/dcache.h

index 8ec83d2dffb75ec23c67d35cce2c5ee272943c1a..bbe42f42ca8f151385cc96457dc914129325cbe7 100644 (file)
@@ -83,10 +83,11 @@ out:
 int pohmelfs_path_length(struct pohmelfs_inode *pi)
 {
        struct dentry *d, *root, *first;
-       int len = 1; /* Root slash */
+       int len;
+       unsigned seq;
 
-       first = d = d_find_alias(&pi->vfs_inode);
-       if (!d) {
+       first = d_find_alias(&pi->vfs_inode);
+       if (!first) {
                dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode);
                return -ENOENT;
        }
@@ -95,6 +96,11 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
        root = dget(current->fs->root.dentry);
        spin_unlock(&current->fs->lock);
 
+rename_retry:
+       len = 1; /* Root slash */
+       d = first;
+       seq = read_seqbegin(&rename_lock);
+       rcu_read_lock();
        spin_lock(&dcache_lock);
 
        if (!IS_ROOT(d) && d_unhashed(d))
@@ -105,6 +111,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
                d = d->d_parent;
        }
        spin_unlock(&dcache_lock);
+       rcu_read_unlock();
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
 
        dput(root);
        dput(first);
index 2341375386f891e9361ce191e54f9fd3867d1466..4be8f778a4189872a6956be82d57c19ff2ba9e83 100644 (file)
@@ -186,16 +186,25 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 {
        struct dentry *root = sbi->sb->s_root;
        struct dentry *tmp;
-       char *buf = *name;
+       char *buf;
        char *p;
-       int len = 0;
-
+       int len;
+       unsigned seq;
+
+rename_retry:
+       buf = *name;
+       len = 0;
+       seq = read_seqbegin(&rename_lock);
+       rcu_read_lock();
        spin_lock(&dcache_lock);
        for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
                len += tmp->d_name.len + 1;
 
        if (!len || --len > NAME_MAX) {
                spin_unlock(&dcache_lock);
+               rcu_read_unlock();
+               if (read_seqretry(&rename_lock, seq))
+                       goto rename_retry;
                return 0;
        }
 
@@ -209,6 +218,9 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
                strncpy(p, tmp->d_name.name, tmp->d_name.len);
        }
        spin_unlock(&dcache_lock);
+       rcu_read_unlock();
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
 
        return len;
 }
index a09f0771fd27a3271245e29d073ecf1d4aaaa3a0..a9bc4ecc21e161b8ab2b7dad70b515282ab52a3b 100644 (file)
@@ -80,6 +80,7 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
+EXPORT_SYMBOL(rename_lock);
 EXPORT_SYMBOL(dcache_inode_lock);
 EXPORT_SYMBOL(dcache_lock);
 
@@ -243,6 +244,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
        __releases(dcache_inode_lock)
        __releases(dcache_lock)
 {
+       dentry->d_parent = NULL;
        list_del(&dentry->d_u.d_child);
        if (parent)
                spin_unlock(&parent->d_lock);
@@ -1017,11 +1019,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
  * Return true if the parent or its subdirectories contain
  * a mount point
  */
 int have_submounts(struct dentry *parent)
 {
-       struct dentry *this_parent = parent;
+       struct dentry *this_parent;
        struct list_head *next;
+       unsigned seq;
+
+rename_retry:
+       this_parent = parent;
+       seq = read_seqbegin(&rename_lock);
 
        spin_lock(&dcache_lock);
        if (d_mountpoint(parent))
@@ -1055,17 +1061,37 @@ resume:
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-               next = this_parent->d_u.d_child.next;
+               struct dentry *tmp;
+               struct dentry *child;
+
+               tmp = this_parent->d_parent;
+               rcu_read_lock();
                spin_unlock(&this_parent->d_lock);
-               this_parent = this_parent->d_parent;
+               child = this_parent;
+               this_parent = tmp;
                spin_lock(&this_parent->d_lock);
+               /* might go back up the wrong parent if we have had a rename
+                * or deletion */
+               if (this_parent != child->d_parent ||
+                               read_seqretry(&rename_lock, seq)) {
+                       spin_unlock(&this_parent->d_lock);
+                       spin_unlock(&dcache_lock);
+                       rcu_read_unlock();
+                       goto rename_retry;
+               }
+               rcu_read_unlock();
+               next = child->d_u.d_child.next;
                goto resume;
        }
        spin_unlock(&this_parent->d_lock);
        spin_unlock(&dcache_lock);
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
        return 0; /* No mount points found in tree */
 positive:
        spin_unlock(&dcache_lock);
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
        return 1;
 }
 EXPORT_SYMBOL(have_submounts);
@@ -1086,10 +1112,15 @@ EXPORT_SYMBOL(have_submounts);
  */
 static int select_parent(struct dentry * parent)
 {
-       struct dentry *this_parent = parent;
+       struct dentry *this_parent;
        struct list_head *next;
+       unsigned seq;
        int found = 0;
 
+rename_retry:
+       this_parent = parent;
+       seq = read_seqbegin(&rename_lock);
+
        spin_lock(&dcache_lock);
        spin_lock(&this_parent->d_lock);
 repeat:
@@ -1099,7 +1130,6 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-               BUG_ON(this_parent == dentry);
 
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
@@ -1142,17 +1172,32 @@ resume:
         */
        if (this_parent != parent) {
                struct dentry *tmp;
-               next = this_parent->d_u.d_child.next;
+               struct dentry *child;
+
                tmp = this_parent->d_parent;
+               rcu_read_lock();
                spin_unlock(&this_parent->d_lock);
-               BUG_ON(tmp == this_parent);
+               child = this_parent;
                this_parent = tmp;
                spin_lock(&this_parent->d_lock);
+               /* might go back up the wrong parent if we have had a rename
+                * or deletion */
+               if (this_parent != child->d_parent ||
+                               read_seqretry(&rename_lock, seq)) {
+                       spin_unlock(&this_parent->d_lock);
+                       spin_unlock(&dcache_lock);
+                       rcu_read_unlock();
+                       goto rename_retry;
+               }
+               rcu_read_unlock();
+               next = child->d_u.d_child.next;
                goto resume;
        }
 out:
        spin_unlock(&this_parent->d_lock);
        spin_unlock(&dcache_lock);
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
        return found;
 }
 
@@ -1654,7 +1699,7 @@ EXPORT_SYMBOL(d_add_ci);
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
        struct dentry * dentry = NULL;
-       unsigned long seq;
+       unsigned seq;
 
         do {
                 seq = read_seqbegin(&rename_lock);
@@ -2290,7 +2335,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * Caller holds the dcache_lock.
+ * Caller holds the rename_lock.
  *
  * If path is not reachable from the supplied root, then the value of
  * root is changed (without modifying refcounts).
@@ -2377,7 +2422,9 @@ char *__d_path(const struct path *path, struct path *root,
 
        prepend(&res, &buflen, "\0", 1);
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        error = prepend_path(path, root, &res, &buflen);
+       write_sequnlock(&rename_lock);
        spin_unlock(&dcache_lock);
 
        if (error)
@@ -2441,10 +2488,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
 
        get_fs_root(current->fs, &root);
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (error)
                res = ERR_PTR(error);
+       write_sequnlock(&rename_lock);
        spin_unlock(&dcache_lock);
        path_put(&root);
        return res;
@@ -2472,10 +2521,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
 
        get_fs_root(current->fs, &root);
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        tmp = root;
        error = path_with_deleted(path, &tmp, &res, &buflen);
        if (!error && !path_equal(&tmp, &root))
                error = prepend_unreachable(&res, &buflen);
+       write_sequnlock(&rename_lock);
        spin_unlock(&dcache_lock);
        path_put(&root);
        if (error)
@@ -2544,7 +2595,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
        char *retval;
 
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        retval = __dentry_path(dentry, buf, buflen);
+       write_sequnlock(&rename_lock);
        spin_unlock(&dcache_lock);
 
        return retval;
@@ -2557,6 +2610,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
        char *retval;
 
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        if (d_unlinked(dentry)) {
                p = buf + buflen;
                if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2564,6 +2618,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
                buflen++;
        }
        retval = __dentry_path(dentry, buf, buflen);
+       write_sequnlock(&rename_lock);
        spin_unlock(&dcache_lock);
        if (!IS_ERR(retval) && p)
                *p = '/';       /* restore '/' overriden with '\0' */
@@ -2604,6 +2659,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
        error = -ENOENT;
        spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                struct path tmp = root;
@@ -2612,6 +2668,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
                prepend(&cwd, &buflen, "\0", 1);
                error = prepend_path(&pwd, &tmp, &cwd, &buflen);
+               write_sequnlock(&rename_lock);
                spin_unlock(&dcache_lock);
 
                if (error)
@@ -2631,8 +2688,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                        if (copy_to_user(buf, cwd, len))
                                error = -EFAULT;
                }
-       } else
+       } else {
+               write_sequnlock(&rename_lock);
                spin_unlock(&dcache_lock);
+       }
 
 out:
        path_put(&pwd);
@@ -2660,25 +2719,25 @@ out:
 int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
        int result;
-       unsigned long seq;
+       unsigned seq;
 
        if (new_dentry == old_dentry)
                return 1;
 
-       /*
-        * Need rcu_readlock to protect against the d_parent trashing
-        * due to d_move
-        */
-       rcu_read_lock();
        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
+               /*
+                * Need rcu_readlock to protect against the d_parent trashing
+                * due to d_move
+                */
+               rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = 1;
                else
                        result = 0;
+               rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));
-       rcu_read_unlock();
 
        return result;
 }
@@ -2710,9 +2769,13 @@ EXPORT_SYMBOL(path_is_under);
 
 void d_genocide(struct dentry *root)
 {
-       struct dentry *this_parent = root;
+       struct dentry *this_parent;
        struct list_head *next;
+       unsigned seq;
 
+rename_retry:
+       this_parent = root;
+       seq = read_seqbegin(&rename_lock);
        spin_lock(&dcache_lock);
        spin_lock(&this_parent->d_lock);
 repeat:
@@ -2722,6 +2785,7 @@ resume:
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
+
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                if (d_unhashed(dentry) || !dentry->d_inode) {
                        spin_unlock(&dentry->d_lock);
@@ -2734,19 +2798,43 @@ resume:
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
-               dentry->d_count--;
+               if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+                       dentry->d_flags |= DCACHE_GENOCIDE;
+                       dentry->d_count--;
+               }
                spin_unlock(&dentry->d_lock);
        }
        if (this_parent != root) {
-               next = this_parent->d_u.d_child.next;
-               this_parent->d_count--;
+               struct dentry *tmp;
+               struct dentry *child;
+
+               tmp = this_parent->d_parent;
+               if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
+                       this_parent->d_flags |= DCACHE_GENOCIDE;
+                       this_parent->d_count--;
+               }
+               rcu_read_lock();
                spin_unlock(&this_parent->d_lock);
-               this_parent = this_parent->d_parent;
+               child = this_parent;
+               this_parent = tmp;
                spin_lock(&this_parent->d_lock);
+               /* might go back up the wrong parent if we have had a rename
+                * or deletion */
+               if (this_parent != child->d_parent ||
+                               read_seqretry(&rename_lock, seq)) {
+                       spin_unlock(&this_parent->d_lock);
+                       spin_unlock(&dcache_lock);
+                       rcu_read_unlock();
+                       goto rename_retry;
+               }
+               rcu_read_unlock();
+               next = child->d_u.d_child.next;
                goto resume;
        }
        spin_unlock(&this_parent->d_lock);
        spin_unlock(&dcache_lock);
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
 }
 
 /**
index db6aa3673cf33b2d720b65ceb728a8a6e3244ab5..78c0ebb0b07c93b40bdbcc07cf9744a3860f9bd6 100644 (file)
@@ -49,11 +49,17 @@ char *nfs_path(const char *base,
               const struct dentry *dentry,
               char *buffer, ssize_t buflen)
 {
-       char *end = buffer+buflen;
+       char *end;
        int namelen;
+       unsigned seq;
 
+rename_retry:
+       end = buffer+buflen;
        *--end = '\0';
        buflen--;
+
+       seq = read_seqbegin(&rename_lock);
+       rcu_read_lock();
        spin_lock(&dcache_lock);
        while (!IS_ROOT(dentry) && dentry != droot) {
                namelen = dentry->d_name.len;
@@ -66,6 +72,9 @@ char *nfs_path(const char *base,
                dentry = dentry->d_parent;
        }
        spin_unlock(&dcache_lock);
+       rcu_read_unlock();
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
        if (*end != '/') {
                if (--buflen < 0)
                        goto Elong;
@@ -83,6 +92,9 @@ char *nfs_path(const char *base,
        return end;
 Elong_unlock:
        spin_unlock(&dcache_lock);
+       rcu_read_unlock();
+       if (read_seqretry(&rename_lock, seq))
+               goto rename_retry;
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
index bda5ec0b077d9e36eb4b33e5fee0ead7bb37a9ac..c963ebada922a938739697ab1b8cda797570bf7b 100644 (file)
@@ -180,6 +180,7 @@ struct dentry_operations {
 #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
 
 #define DCACHE_CANT_MOUNT      0x0100
+#define DCACHE_GENOCIDE                0x0200
 
 extern spinlock_t dcache_inode_lock;
 extern spinlock_t dcache_lock;