[PATCH] r/o bind mounts: elevate write count for open()s

author Dave Hansen <haveblue@us.ibm.com>

Fri, 15 Feb 2008 22:37:48 +0000 (14:37 -0800)

committer Al Viro <viro@zeniv.linux.org.uk>

Sat, 19 Apr 2008 04:29:25 +0000 (00:29 -0400)
author Dave Hansen <haveblue@us.ibm.com>
Fri, 15 Feb 2008 22:37:48 +0000 (14:37 -0800)
committer Al Viro <viro@zeniv.linux.org.uk>
Sat, 19 Apr 2008 04:29:25 +0000 (00:29 -0400)
diff --git a/fs/file_table.c b/fs/file_table.c

index 3f73eb1f195a12638cf00fa276b39cd6eaa9f18e..71efc7000226789e54d7d1dfb7874638ba619f81 100644 (file)
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -199,6 +199,17 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
         file->f_mapping = dentry->d_inode->i_mapping;
         file->f_mode = mode;
         file->f_op = fop;
+
+       /*
+        * These mounts don't really matter in practice
+        * for r/o bind mounts.  They aren't userspace-
+        * visible.  We do this for consistency, and so
+        * that we can do debugging checks at __fput()
+        */
+       if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+               error = mnt_want_write(mnt);
+               WARN_ON(error);
+       }
         return error;
  }
  EXPORT_SYMBOL(init_file);
@@ -221,10 +232,13 @@ EXPORT_SYMBOL(fput);
   */
  void drop_file_write_access(struct file *file)
  {
+       struct vfsmount *mnt = file->f_path.mnt;
         struct dentry *dentry = file->f_path.dentry;
         struct inode *inode = dentry->d_inode;
  
         put_write_access(inode);
+       if (!special_file(inode->i_mode))
+               mnt_drop_write(mnt);
  }
  EXPORT_SYMBOL_GPL(drop_file_write_access);
  
diff --git a/fs/namei.c b/fs/namei.c

index 83c843b3fea31fa8d95897cd6b4e9c85fe8aa288..e179f71bfcb058df613f83cb1ff923d47ffc4618 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
                         return -EACCES;
  
                 flag &= ~O_TRUNC;
-       } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
-               return -EROFS;
+       }
  
         error = vfs_permission(nd, acc_mode);
         if (error)
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag)
         return flag;
  }
  
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+       /*
+        * We'll never write to the fs underlying
+        * a device file.
+        */
+       if (special_file(inode->i_mode))
+               return 0;
+       return (flag & O_TRUNC);
+}
+
  /*
- * Note that the low bits of "flag" aren't the same as in the open
- * system call.  See open_to_namei_flags().
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
   */
  struct file *do_filp_open(int dfd, const char *pathname,
                 int open_flag, int mode)
  {
+       struct file *filp;
         struct nameidata nd;
         int acc_mode, error;
         struct path path;
         struct dentry *dir;
         int count = 0;
+       int will_write;
         int flag = open_to_namei_flags(open_flag);
  
         acc_mode = ACC_MODE(flag);
@@ -1791,17 +1804,30 @@ do_last:
         }
  
         if (IS_ERR(nd.intent.open.file)) {
-               mutex_unlock(&dir->d_inode->i_mutex);
                 error = PTR_ERR(nd.intent.open.file);
-               goto exit_dput;
+               goto exit_mutex_unlock;
         }
  
         /* Negative dentry, just create the file */
         if (!path.dentry->d_inode) {
-               error = __open_namei_create(&nd, &path, flag, mode);
+               /*
+                * This write is needed to ensure that a
+                * ro->rw transition does not occur between
+                * the time when the file is created and when
+                * a permanent write count is taken through
+                * the 'struct file' in nameidata_to_filp().
+                */
+               error = mnt_want_write(nd.path.mnt);
                 if (error)
+                       goto exit_mutex_unlock;
+               error = __open_namei_create(&nd, &path, flag, mode);
+               if (error) {
+                       mnt_drop_write(nd.path.mnt);
                         goto exit;
-               return nameidata_to_filp(&nd, open_flag);
+               }
+               filp = nameidata_to_filp(&nd, open_flag);
+               mnt_drop_write(nd.path.mnt);
+               return filp;
         }
  
         /*
@@ -1831,11 +1857,40 @@ do_last:
         if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
                 goto exit;
  ok:
+       /*
+        * Consider:
+        * 1. may_open() truncates a file
+        * 2. a rw->ro mount transition occurs
+        * 3. nameidata_to_filp() fails due to
+        *    the ro mount.
+        * That would be inconsistent, and should
+        * be avoided. Taking this mnt write here
+        * ensures that (2) can not occur.
+        */
+       will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+       if (will_write) {
+               error = mnt_want_write(nd.path.mnt);
+               if (error)
+                       goto exit;
+       }
         error = may_open(&nd, acc_mode, flag);
-       if (error)
+       if (error) {
+               if (will_write)
+                       mnt_drop_write(nd.path.mnt);
                 goto exit;
-       return nameidata_to_filp(&nd, open_flag);
+       }
+       filp = nameidata_to_filp(&nd, open_flag);
+       /*
+        * It is now safe to drop the mnt write
+        * because the filp has had a write taken
+        * on its behalf.
+        */
+       if (will_write)
+               mnt_drop_write(nd.path.mnt);
+       return filp;
  
+exit_mutex_unlock:
+       mutex_unlock(&dir->d_inode->i_mutex);
  exit_dput:
         path_put_conditional(&path, &nd);
  exit:
diff --git a/fs/open.c b/fs/open.c

index 8111947905d8ebe814a0f38fd17eb7ff6b4f4135..e12f1701032428ae3dd8476043a0103bac2f385f 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -730,6 +730,35 @@ out:
         return error;
  }
  
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput().  This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+                                         struct vfsmount *mnt)
+{
+       int error;
+       error = get_write_access(inode);
+       if (error)
+               return error;
+       /*
+        * Do not take mount writer counts on
+        * special files since no writes to
+        * the mount itself will occur.
+        */
+       if (!special_file(inode->i_mode)) {
+               /*
+                * Balanced in __fput()
+                */
+               error = mnt_want_write(mnt);
+               if (error)
+                       put_write_access(inode);
+       }
+       return error;
+}
+
  static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                                         int flags, struct file *f,
                                         int (*open)(struct inode *, struct file *))
@@ -742,7 +771,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                                 FMODE_PREAD | FMODE_PWRITE;
         inode = dentry->d_inode;
         if (f->f_mode & FMODE_WRITE) {
-               error = get_write_access(inode);
+               error = __get_file_write_access(inode, mnt);
                 if (error)
                         goto cleanup_file;
         }
@@ -784,8 +813,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
  
  cleanup_all:
         fops_put(f->f_op);
-       if (f->f_mode & FMODE_WRITE)
+       if (f->f_mode & FMODE_WRITE) {
                 put_write_access(inode);
+               if (!special_file(inode->i_mode))
+                       mnt_drop_write(mnt);
+       }
         file_kill(f);
         f->f_path.dentry = NULL;
         f->f_path.mnt = NULL;
diff --git a/ipc/mqueue.c b/ipc/mqueue.c

index 34262c11f48057a1bd2c48fc83c272c443f5a468..94fd3b08fb77036d35ecd1a337785f1847fe1613 100644 (file)
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -598,6 +598,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
                         int oflag, mode_t mode, struct mq_attr __user *u_attr)
  {
         struct mq_attr attr;
+       struct file *result;
         int ret;
  
         if (u_attr) {
@@ -612,13 +613,24 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
         }
  
         mode &= ~current->fs->umask;
+       ret = mnt_want_write(mqueue_mnt);
+       if (ret)
+               goto out;
         ret = vfs_create(dir->d_inode, dentry, mode, NULL);
         dentry->d_fsdata = NULL;
         if (ret)
-               goto out;
+               goto out_drop_write;
  
-       return dentry_open(dentry, mqueue_mnt, oflag);
+       result = dentry_open(dentry, mqueue_mnt, oflag);
+       /*
+        * dentry_open() took a persistent mnt_want_write(),
+        * so we can now drop this one.
+        */
+       mnt_drop_write(mqueue_mnt);
+       return result;
  
+out_drop_write:
+       mnt_drop_write(mqueue_mnt);
  out:
         dput(dentry);
         mntput(mqueue_mnt);
author	Dave Hansen <haveblue@us.ibm.com>
	Fri, 15 Feb 2008 22:37:48 +0000 (14:37 -0800)
committer	Al Viro <viro@zeniv.linux.org.uk>
	Sat, 19 Apr 2008 04:29:25 +0000 (00:29 -0400)
fs/file_table.c		patch \| blob \| history
fs/namei.c		patch \| blob \| history
fs/open.c		patch \| blob \| history
ipc/mqueue.c		patch \| blob \| history