dax: rip out get_block based IO support
author	Jan Kara <jack@suse.cz>
Mon, 21 Nov 2016 01:48:36 +0000 (20:48 -0500)
committer	Theodore Ts'o <tytso@mit.edu>
Mon, 21 Nov 2016 01:48:36 +0000 (20:48 -0500)
No one uses the functions built around the get_block callback anymore. Rip
them out and update the documentation.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Documentation/filesystems/dax.txt
fs/dax.c
include/linux/dax.h

index 23d18b8a49d52ff6fec5362aae67abbe7adb102e..a7e6e14aeb08f3eaba96c9cfa0a00ebd610d6573 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
 Filesystem support consists of
 - adding support to mark inodes as being DAX by setting the S_DAX flag in
   i_flags
-- implementing the direct_IO address space operation, and calling
-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
+  when the inode has the S_DAX flag set
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
-  include handlers for fault, pmd_fault and page_mkwrite (which should
-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
-  appropriate get_block() callback)
-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
-- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite and pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for the fault and
+  page_mkwrite handlers), dax_iomap_pmd_fault() and dax_pfn_mkwrite(),
+  passing the appropriate iomap operations.
+- calling iomap_zero_range(), passing the appropriate iomap operations, instead
+  of block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
-The get_block() callback passed to the DAX functions may return
-uninitialised extents.  If it does, it must ensure that simultaneous
-calls to get_block() (for example by a page-fault racing with a read()
-or a write()) work correctly.
+The iomap handlers for allocating blocks must make sure that allocated blocks
+are zeroed out and converted to written extents before being returned, to
+avoid exposing uninitialized data through mmap.
 
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
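
As an illustration of the ->read_iter bullet above, here is a minimal sketch
of the DAX read path. The "myfs" names and myfs_iomap_ops are hypothetical; a
real filesystem passes its own iomap operations (ext4, for instance, uses
ext4_iomap_ops):

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uio.h>

/*
 * Hypothetical ->read_iter for a DAX inode: take the inode lock shared
 * and let dax_iomap_rw() walk the extents via the iomap operations.
 */
static ssize_t myfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* avoid an atime update for empty reads */

	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

The write side is symmetric, except that it takes the inode lock exclusively
(inode_lock()) and runs generic_write_checks() before calling dax_iomap_rw().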
index 28af41b9da3ac61027beb189f5a8a42f0ca2c69d..ad131cd2605d6dd64a24a5c13e918caa108a6a48 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
        return page;
 }
 
-static bool buffer_written(struct buffer_head *bh)
-{
-       return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-               const struct inode *inode)
-{
-       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-       return sector;
-}
-
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-                     loff_t start, loff_t end, get_block_t get_block,
-                     struct buffer_head *bh)
-{
-       loff_t pos = start, max = start, bh_max = start;
-       bool hole = false;
-       struct block_device *bdev = NULL;
-       int rw = iov_iter_rw(iter), rc;
-       long map_len = 0;
-       struct blk_dax_ctl dax = {
-               .addr = ERR_PTR(-EIO),
-       };
-       unsigned blkbits = inode->i_blkbits;
-       sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
-                                                               >> blkbits;
-
-       if (rw == READ)
-               end = min(end, i_size_read(inode));
-
-       while (pos < end) {
-               size_t len;
-               if (pos == max) {
-                       long page = pos >> PAGE_SHIFT;
-                       sector_t block = page << (PAGE_SHIFT - blkbits);
-                       unsigned first = pos - (block << blkbits);
-                       long size;
-
-                       if (pos == bh_max) {
-                               bh->b_size = PAGE_ALIGN(end - pos);
-                               bh->b_state = 0;
-                               rc = get_block(inode, block, bh, rw == WRITE);
-                               if (rc)
-                                       break;
-                               bh_max = pos - first + bh->b_size;
-                               bdev = bh->b_bdev;
-                               /*
-                                * We allow uninitialized buffers for writes
-                                * beyond EOF as those cannot race with faults
-                                */
-                               WARN_ON_ONCE(
-                                       (buffer_new(bh) && block < file_blks) ||
-                                       (rw == WRITE && buffer_unwritten(bh)));
-                       } else {
-                               unsigned done = bh->b_size -
-                                               (bh_max - (pos - first));
-                               bh->b_blocknr += done >> blkbits;
-                               bh->b_size -= done;
-                       }
-
-                       hole = rw == READ && !buffer_written(bh);
-                       if (hole) {
-                               size = bh->b_size - first;
-                       } else {
-                               dax_unmap_atomic(bdev, &dax);
-                               dax.sector = to_sector(bh, inode);
-                               dax.size = bh->b_size;
-                               map_len = dax_map_atomic(bdev, &dax);
-                               if (map_len < 0) {
-                                       rc = map_len;
-                                       break;
-                               }
-                               dax.addr += first;
-                               size = map_len - first;
-                       }
-                       /*
-                        * pos + size is one past the last offset for IO,
-                        * so pos + size can overflow loff_t at extreme offsets.
-                        * Cast to u64 to catch this and get the true minimum.
-                        */
-                       max = min_t(u64, pos + size, end);
-               }
-
-               if (iov_iter_rw(iter) == WRITE) {
-                       len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-               } else if (!hole)
-                       len = copy_to_iter((void __force *) dax.addr, max - pos,
-                                       iter);
-               else
-                       len = iov_iter_zero(max - pos, iter);
-
-               if (!len) {
-                       rc = -EFAULT;
-                       break;
-               }
-
-               pos += len;
-               if (!IS_ERR(dax.addr))
-                       dax.addr += len;
-       }
-
-       dax_unmap_atomic(bdev, &dax);
-
-       return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes.  For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-                 struct iov_iter *iter, get_block_t get_block,
-                 dio_iodone_t end_io, int flags)
-{
-       struct buffer_head bh;
-       ssize_t retval = -EINVAL;
-       loff_t pos = iocb->ki_pos;
-       loff_t end = pos + iov_iter_count(iter);
-
-       memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
-
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               inode_lock(inode);
-
-       /* Protects against truncate */
-       if (!(flags & DIO_SKIP_DIO_COUNT))
-               inode_dio_begin(inode);
-
-       retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               inode_unlock(inode);
-
-       if (end_io) {
-               int err;
-
-               err = end_io(iocb, pos, retval, bh.b_private);
-               if (err)
-                       retval = err;
-       }
-
-       if (!(flags & DIO_SKIP_DIO_COUNT))
-               inode_dio_end(inode);
-       return retval;
-}
-EXPORT_SYMBOL_GPL(dax_do_io);
-
 /*
  * DAX radix tree locking
  */
@@ -919,105 +757,6 @@ static int dax_insert_mapping(struct address_space *mapping,
        return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
-/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
-{
-       struct file *file = vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       void *entry;
-       struct buffer_head bh;
-       unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       unsigned blkbits = inode->i_blkbits;
-       sector_t block;
-       pgoff_t size;
-       int error;
-       int major = 0;
-
-       /*
-        * Check whether offset isn't beyond end of file now. Caller is supposed
-        * to hold locks serializing us with truncate / punch hole so this is
-        * a reliable test.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size)
-               return VM_FAULT_SIGBUS;
-
-       memset(&bh, 0, sizeof(bh));
-       block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-       bh.b_bdev = inode->i_sb->s_bdev;
-       bh.b_size = PAGE_SIZE;
-
-       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-       if (IS_ERR(entry)) {
-               error = PTR_ERR(entry);
-               goto out;
-       }
-
-       error = get_block(inode, block, &bh, 0);
-       if (!error && (bh.b_size < PAGE_SIZE))
-               error = -EIO;           /* fs corruption? */
-       if (error)
-               goto unlock_entry;
-
-       if (vmf->cow_page) {
-               struct page *new_page = vmf->cow_page;
-               if (buffer_written(&bh))
-                       error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
-                                       bh.b_size, new_page, vaddr);
-               else
-                       clear_user_highpage(new_page, vaddr);
-               if (error)
-                       goto unlock_entry;
-               if (!radix_tree_exceptional_entry(entry)) {
-                       vmf->page = entry;
-                       return VM_FAULT_LOCKED;
-               }
-               vmf->entry = entry;
-               return VM_FAULT_DAX_LOCKED;
-       }
-
-       if (!buffer_mapped(&bh)) {
-               if (vmf->flags & FAULT_FLAG_WRITE) {
-                       error = get_block(inode, block, &bh, 1);
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
-                       if (!error && (bh.b_size < PAGE_SIZE))
-                               error = -EIO;
-                       if (error)
-                               goto unlock_entry;
-               } else {
-                       return dax_load_hole(mapping, entry, vmf);
-               }
-       }
-
-       /* Filesystem should not return unwritten buffers to us! */
-       WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-       error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
-                       bh.b_size, &entry, vma, vmf);
- unlock_entry:
-       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM | major;
-       /* -EBUSY is fine, somebody else faulted on the same PTE */
-       if ((error < 0) && (error != -EBUSY))
-               return VM_FAULT_SIGBUS | major;
-       return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file.  This is intended for hole-punch operations.  If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-                                                       get_block_t get_block)
-{
-       struct buffer_head bh;
-       pgoff_t index = from >> PAGE_SHIFT;
-       unsigned offset = from & (PAGE_SIZE-1);
-       int err;
-
-       /* Block boundary? Nothing to do */
-       if (!length)
-               return 0;
-       if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
-               return -EINVAL;
-
-       memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
-       bh.b_size = PAGE_SIZE;
-       err = get_block(inode, index, &bh, 0);
-       if (err < 0 || !buffer_written(&bh))
-               return err;
-
-       return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
-                       offset, length);
-}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-       unsigned length = PAGE_ALIGN(from) - from;
-       return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-
 #ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
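
With dax_truncate_page() and dax_zero_page_range() gone, zeroing the partial
page at truncation time is handled by iomap_zero_range(), as the documentation
change above says. A hedged sketch, again assuming a hypothetical
myfs_iomap_ops:

#include <linux/iomap.h>

/*
 * Hypothetical stand-in for the removed dax_truncate_page(): zero from
 * 'from' up to the next page boundary, letting the iomap operations map
 * the blocks (holes and unwritten extents are simply skipped).
 */
static int myfs_dax_truncate_page(struct inode *inode, loff_t from)
{
	unsigned int offset = from & (PAGE_SIZE - 1);

	if (!offset)		/* page aligned, nothing to zero */
		return 0;
	return iomap_zero_range(inode, from, PAGE_SIZE - offset,
				NULL, &myfs_iomap_ops);
}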
index 8d1a5c47945f2bf6de635ca340f0226b80a9c3d3..0afade8bd3d7ca0bb33ac75dc11243b097bebf11 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                struct iomap_ops *ops);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
-                 get_block_t, dio_iodone_t, int flags);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        struct iomap_ops *ops);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
                pgoff_t index, void *entry, bool wake_all);
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-                               pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-       return VM_FAULT_FALLBACK;
-}
-
 #ifdef CONFIG_FS_DAX_PMD
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 }
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
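
Finally, the fault path: per the updated documentation, the fault and
page_mkwrite handlers now call dax_iomap_fault(). A minimal sketch, loosely
modelled on ext4; MYFS_I(), the mmap_rwsem serializing faults against
truncate, and myfs_iomap_ops are all assumptions:

/*
 * Hypothetical ->fault handler: account a write fault, serialize against
 * truncate with a filesystem-private rwsem, then hand the real work to
 * dax_iomap_fault() together with the iomap operations.
 */
static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	int ret;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}
	down_read(&MYFS_I(inode)->mmap_rwsem);	/* assumed per-inode lock */
	ret = dax_iomap_fault(vma, vmf, &myfs_iomap_ops);
	up_read(&MYFS_I(inode)->mmap_rwsem);
	if (write)
		sb_end_pagefault(inode->i_sb);

	return ret;
}

The same handler can also serve as ->page_mkwrite, since dax_iomap_fault()
distinguishes read and write faults via vmf->flags.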