ARM: EXYNOS: Annotate iomem and pm_data pointers __ro_after_init
[sfrench/cifs-2.6.git] / fs / read_write.c
index 190e0d362581a9a87d5cdae2a069e0d7e8991af9..5816d4c4cab09c22274c2cdc40f4176f16426f0a 100644 (file)
@@ -20,7 +20,7 @@
 #include <linux/fs.h>
 #include "internal.h"
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/unistd.h>
 
 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
@@ -1538,28 +1538,43 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
        if (len == 0)
                return 0;
 
-       ret = mnt_want_write_file(file_out);
-       if (ret)
-               return ret;
+       sb_start_write(inode_out->i_sb);
 
-       ret = -EOPNOTSUPP;
-       if (file_out->f_op->copy_file_range)
+       /*
+        * Try cloning first, this is supported by more file systems, and
+        * more efficient if both clone and copy are supported (e.g. NFS).
+        */
+       if (file_in->f_op->clone_file_range) {
+               ret = file_in->f_op->clone_file_range(file_in, pos_in,
+                               file_out, pos_out, len);
+               if (ret == 0) {
+                       ret = len;
+                       goto done;
+               }
+       }
+
+       if (file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
                                                      pos_out, len, flags);
-       if (ret == -EOPNOTSUPP)
-               ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+               if (ret != -EOPNOTSUPP)
+                       goto done;
+       }
+
+       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+                       len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
 
+done:
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }
+
        inc_syscr(current);
        inc_syscw(current);
 
-       mnt_drop_write_file(file_out);
+       sb_end_write(inode_out->i_sb);
 
        return ret;
 }
@@ -1650,6 +1665,115 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
 
+/*
+ * Check that the two inodes are eligible for cloning, the ranges make
+ * sense, and then flush all dirty data.  Caller must ensure that the
+ * inodes have been locked against any other modifications.
+ *
+ * Returns: 0 for "nothing to clone", 1 for "something to clone", or
+ * the usual negative error code.
+ */
+int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+                              struct inode *inode_out, loff_t pos_out,
+                              u64 *len, bool is_dedupe)
+{
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       loff_t blen;
+       loff_t isize;
+       bool same_inode = (inode_in == inode_out);
+       int ret;
+
+       /* Don't touch certain kinds of inodes */
+       if (IS_IMMUTABLE(inode_out))
+               return -EPERM;
+
+       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+               return -ETXTBSY;
+
+       /* Don't reflink dirs, pipes, sockets... */
+       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+               return -EISDIR;
+       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+               return -EINVAL;
+
+       /* Are we going all the way to the end? */
+       isize = i_size_read(inode_in);
+       if (isize == 0)
+               return 0;
+
+       /* Zero length dedupe exits immediately; reflink goes to EOF. */
+       if (*len == 0) {
+               if (is_dedupe || pos_in == isize)
+                       return 0;
+               if (pos_in > isize)
+                       return -EINVAL;
+               *len = isize - pos_in;
+       }
+
+       /* Ensure offsets don't wrap and the input is inside i_size */
+       if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
+           pos_in + *len > isize)
+               return -EINVAL;
+
+       /* Don't allow dedupe past EOF in the dest file */
+       if (is_dedupe) {
+               loff_t  disize;
+
+               disize = i_size_read(inode_out);
+               if (pos_out >= disize || pos_out + *len > disize)
+                       return -EINVAL;
+       }
+
+       /* If we're linking to EOF, continue to the block boundary. */
+       if (pos_in + *len == isize)
+               blen = ALIGN(isize, bs) - pos_in;
+       else
+               blen = *len;
+
+       /* Only reflink if we're aligned to block boundaries */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+               return -EINVAL;
+
+       /* Don't allow overlapped reflink within the same file */
+       if (same_inode) {
+               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+                       return -EINVAL;
+       }
+
+       /* Wait for the completion of any pending IOs on both files */
+       inode_dio_wait(inode_in);
+       if (!same_inode)
+               inode_dio_wait(inode_out);
+
+       ret = filemap_write_and_wait_range(inode_in->i_mapping,
+                       pos_in, pos_in + *len - 1);
+       if (ret)
+               return ret;
+
+       ret = filemap_write_and_wait_range(inode_out->i_mapping,
+                       pos_out, pos_out + *len - 1);
+       if (ret)
+               return ret;
+
+       /*
+        * Check that the extents are the same.
+        */
+       if (is_dedupe) {
+               bool            is_same = false;
+
+               ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+                               inode_out, pos_out, *len, &is_same);
+               if (ret)
+                       return ret;
+               if (!is_same)
+                       return -EBADE;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+
 int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, u64 len)
 {
@@ -1657,15 +1781,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
        struct inode *inode_out = file_inode(file_out);
        int ret;
 
-       if (inode_in->i_sb != inode_out->i_sb ||
-           file_in->f_path.mnt != file_out->f_path.mnt)
-               return -EXDEV;
-
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;
 
+       /*
+        * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
+        * the same mount. Practically, they only need to be on the same file
+        * system.
+        */
+       if (inode_in->i_sb != inode_out->i_sb)
+               return -EXDEV;
+
        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
            (file_out->f_flags & O_APPEND))
@@ -1685,10 +1813,6 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
        if (pos_in + len > i_size_read(inode_in))
                return -EINVAL;
 
-       ret = mnt_want_write_file(file_out);
-       if (ret)
-               return ret;
-
        ret = file_in->f_op->clone_file_range(file_in, pos_in,
                        file_out, pos_out, len);
        if (!ret) {
@@ -1696,11 +1820,106 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                fsnotify_modify(file_out);
        }
 
-       mnt_drop_write_file(file_out);
        return ret;
 }
 EXPORT_SYMBOL(vfs_clone_file_range);
 
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+       struct address_space *mapping;
+       struct page *page;
+       pgoff_t n;
+
+       n = offset >> PAGE_SHIFT;
+       mapping = inode->i_mapping;
+       page = read_mapping_page(mapping, n, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       lock_page(page);
+       return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                 struct inode *dest, loff_t destoff,
+                                 loff_t len, bool *is_same)
+{
+       loff_t src_poff;
+       loff_t dest_poff;
+       void *src_addr;
+       void *dest_addr;
+       struct page *src_page;
+       struct page *dest_page;
+       loff_t cmp_len;
+       bool same;
+       int error;
+
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_SIZE - 1);
+               dest_poff = destoff & (PAGE_SIZE - 1);
+               cmp_len = min(PAGE_SIZE - src_poff,
+                             PAGE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               if (cmp_len <= 0)
+                       goto out_error;
+
+               src_page = vfs_dedupe_get_page(src, srcoff);
+               if (IS_ERR(src_page)) {
+                       error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = vfs_dedupe_get_page(dest, destoff);
+               if (IS_ERR(dest_page)) {
+                       error = PTR_ERR(dest_page);
+                       unlock_page(src_page);
+                       put_page(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               unlock_page(dest_page);
+               unlock_page(src_page);
+               put_page(dest_page);
+               put_page(src_page);
+
+               if (!same)
+                       break;
+
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+
+       *is_same = same;
+       return 0;
+
+out_error:
+       return error;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
+
 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 {
        struct file_dedupe_range_info *info;
@@ -1737,6 +1956,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
                goto out;
        ret = 0;
 
+       if (off + len > i_size_read(src))
+               return -EINVAL;
+
        /* pre-format output fields to sane values */
        for (i = 0; i < count; i++) {
                same->info[i].bytes_deduped = 0ULL;