dax: use common 4k zero page for dax mmap reads
author Ross Zwisler <ross.zwisler@linux.intel.com>
Wed, 6 Sep 2017 23:18:43 +0000 (16:18 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 00:27:24 +0000 (17:27 -0700)
When servicing mmap() reads from file holes, the current DAX code
allocates a page cache page of all zeroes and places the struct page
pointer in the mapping->page_tree radix tree.

This has three major drawbacks:

1) It consumes memory unnecessarily. For every 4k page that is read via
   a DAX mmap() over a hole, we allocate a new page cache page. This
   means that if you read 1GiB worth of pages, you end up using 1GiB of
   zeroed memory. This is easily visible by looking at the overall
   memory consumption of the system or by looking at /proc/[pid]/smaps:

7f62e72b3000-7f63272b3000 rw-s 00000000 103:00 12   /root/dax/data
Size:            1048576 kB
Rss:             1048576 kB
Pss:             1048576 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:   1048576 kB
Private_Dirty:         0 kB
Referenced:      1048576 kB
Anonymous:             0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Locked:                0 kB

2) It is slower than using a common zero page because each page fault
   has more work to do. Instead of just inserting a common zero page we
   have to allocate a page cache page, zero it, and then insert it. Here
   are the average latencies of dax_load_hole() as measured by ftrace on
   a random test box:

    Old method, using zeroed page cache pages: 3.4 us
    New method, using the common 4k zero page: 0.8 us

   This was the average latency over 1 GiB of sequential reads done by
   this simple fio script:

     [global]
     size=1G
     filename=/root/dax/data
     fallocate=none
     [io]
     rw=read
     ioengine=mmap

3) Having to check for both DAX exceptional entries and for page cache
   pages in the radix tree made the DAX code more complex.

Solve these issues by following the lead of the DAX PMD code and using a
common 4k zero page instead.  As with the PMD code we will now insert a
DAX exceptional entry into the radix tree instead of a struct page
pointer, which allows us to remove all the special casing in the DAX
code.

Note that we still pretty aggressively check for regular pages in the
DAX radix tree, especially where we take action based on the bits set in
the page.  If we ever find a regular page in our radix tree now, it most
likely means that someone besides DAX is inserting pages (which has
happened lots of times in the past), and we want to find that out early
and fail loudly.
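
To make the new radix tree contents concrete, below is a minimal
user-space sketch of how these exceptional entries pack a sector and
flag bits into a single word.  The RADIX_DAX_* definitions mirror the
include/linux/dax.h hunk further down; the RADIX_TREE_EXCEPTIONAL_*
values are assumptions taken from include/linux/radix-tree.h of this
era, and the in-kernel equivalent (which also sets the lock bit) is
dax_radix_locked_entry() in fs/dax.c:

    #include <stdio.h>

    #define RADIX_TREE_EXCEPTIONAL_ENTRY  2UL  /* assumed, from radix-tree.h */
    #define RADIX_TREE_EXCEPTIONAL_SHIFT  2

    #define RADIX_DAX_SHIFT      (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
    #define RADIX_DAX_ENTRY_LOCK (1UL << RADIX_TREE_EXCEPTIONAL_SHIFT)
    #define RADIX_DAX_PMD        (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
    #define RADIX_DAX_ZERO_PAGE  (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
    #define RADIX_DAX_EMPTY      (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

    /* Pack a sector and flag bits into one exceptional entry word. */
    static unsigned long dax_radix_entry(unsigned long sector,
                                         unsigned long flags)
    {
            return RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
                    (sector << RADIX_DAX_SHIFT);
    }

    int main(void)
    {
            /* A hole backed by the common 4k zero page: no sector,
             * just the ZERO_PAGE flag. */
            unsigned long hole = dax_radix_entry(0, RADIX_DAX_ZERO_PAGE);
            /* A real block mapping at sector 8, no special flags. */
            unsigned long mapped = dax_radix_entry(8, 0);

            printf("hole entry:   %#lx, zero page: %d\n",
                   hole, !!(hole & RADIX_DAX_ZERO_PAGE));
            printf("mapped entry: %#lx, sector: %lu\n",
                   mapped, mapped >> RADIX_DAX_SHIFT);
            return 0;
    }

Since a hole entry is just an encoded word rather than a pointer to an
allocated, zeroed page, servicing hole reads no longer consumes page
cache memory.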

This solution also removes the extra memory consumption.  Here is that
same /proc/[pid]/smaps after 1GiB of reading from a hole with the new
code:

7f2054a74000-7f2094a74000 rw-s 00000000 103:00 12   /root/dax/data
Size:            1048576 kB
Rss:                   0 kB
Pss:                   0 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:         0 kB
Referenced:            0 kB
Anonymous:             0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Locked:                0 kB

Overall system memory consumption is similarly improved.

Another major change is that we remove dax_pfn_mkwrite() from our fault
flow, and instead rely on the page fault itself to make the PTE dirty
and writeable.  The following description from the patch adding the
vm_insert_mixed_mkwrite() call explains this a little more:

   "To be able to use the common 4k zero page in DAX we need to have our
    PTE fault path look more like our PMD fault path where a PTE entry
    can be marked as dirty and writeable as it is first inserted rather
    than waiting for a follow-up dax_pfn_mkwrite() =>
    finish_mkwrite_fault() call.

    Right now we can rely on having a dax_pfn_mkwrite() call because we
    can distinguish between these two cases in do_wp_page():

            case 1: 4k zero page => writeable DAX storage
            case 2: read-only DAX storage => writeable DAX storage

    This distinction is made via vm_normal_page().  vm_normal_page()
    returns NULL for the common 4k zero page, though, just as it does
    for DAX ptes.  Instead of special-casing the DAX + 4k zero page
    combination, we will simplify our DAX PTE page fault sequence so
    that it matches our DAX PMD sequence, and get rid of the
    dax_pfn_mkwrite() helper.  We will instead use dax_iomap_fault() to
    handle write-protection faults.

    This means that insert_pfn() needs to follow the lead of
    insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If
    'mkwrite' is set, insert_pfn() will do the work that was previously
    done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path"
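
To illustrate the resulting fault wiring, here is a hedged sketch of
the vm_operations_struct a DAX filesystem ends up with after this
change: fault, page_mkwrite and pfn_mkwrite all funnel into the same
dax_iomap_fault() call, as the ext2, ext4 and xfs hunks below show.
The "myfs" names and myfs_iomap_ops are hypothetical placeholders;
real handlers additionally bracket the call with sb_start_pagefault()/
sb_end_pagefault() and per-filesystem locking:

    /* Hypothetical filesystem glue; only the dax_iomap_fault() call
     * and the vm_ops wiring are taken from this patch. */
    static int myfs_dax_fault(struct vm_fault *vmf)
    {
            /* Write faults now get a dirty, writeable PTE directly,
             * via vm_insert_mixed_mkwrite() inside the DAX code. */
            return dax_iomap_fault(vmf, PE_SIZE_PTE, &myfs_iomap_ops);
    }

    static const struct vm_operations_struct myfs_dax_vm_ops = {
            .fault          = myfs_dax_fault,
            .page_mkwrite   = myfs_dax_fault,
            .pfn_mkwrite    = myfs_dax_fault, /* dax_pfn_mkwrite() is gone */
    };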

Link: http://lkml.kernel.org/r/20170724170616.25810-4-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/filesystems/dax.txt
fs/dax.c
fs/ext2/file.c
fs/ext4/file.c
fs/xfs/xfs_file.c
include/linux/dax.h
include/trace/events/fs_dax.h

index a7e6e14aeb08f3eaba96c9cfa0a00ebd610d6573..3be3b266be41e8e537b2c2726eb7ad26a97b4dbe 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -63,9 +63,8 @@ Filesystem support consists of
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
   include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
-  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
-  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
-  iomap operations.
+  handlers should probably call dax_iomap_fault() passing the appropriate
+  fault size and iomap operations.
 - calling iomap_zero_range() passing appropriate iomap operations instead of
   block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
index b8882b5ce6ede7882911d96322398cb58ba94730..ab67ae30ccbf69581ae635700e9e0dd625bb5d05 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry)
 
 static int dax_is_zero_entry(void *entry)
 {
-       return (unsigned long)entry & RADIX_DAX_HZP;
+       return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
@@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
        for (;;) {
                entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
                                          &slot);
-               if (!entry || !radix_tree_exceptional_entry(entry) ||
+               if (!entry ||
+                   WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
                    !slot_locked(mapping, slot)) {
                        if (slotp)
                                *slotp = slot;
@@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
-                                    pgoff_t index, void *entry)
+               pgoff_t index)
 {
-       if (!radix_tree_exceptional_entry(entry)) {
-               unlock_page(entry);
-               put_page(entry);
-       } else {
-               dax_unlock_mapping_entry(mapping, index);
-       }
+       dax_unlock_mapping_entry(mapping, index);
 }
 
 /*
@@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 static void put_unlocked_mapping_entry(struct address_space *mapping,
                                       pgoff_t index, void *entry)
 {
-       if (!radix_tree_exceptional_entry(entry))
+       if (!entry)
                return;
 
        /* We have to wake up next waiter for the radix tree entry lock */
@@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 }
 
 /*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
  *
  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
  *
  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
@@ -301,18 +297,21 @@ restart:
        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, &slot);
 
+       if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+               entry = ERR_PTR(-EIO);
+               goto out_unlock;
+       }
+
        if (entry) {
                if (size_flag & RADIX_DAX_PMD) {
-                       if (!radix_tree_exceptional_entry(entry) ||
-                           dax_is_pte_entry(entry)) {
+                       if (dax_is_pte_entry(entry)) {
                                put_unlocked_mapping_entry(mapping, index,
                                                entry);
                                entry = ERR_PTR(-EEXIST);
                                goto out_unlock;
                        }
                } else { /* trying to grab a PTE entry */
-                       if (radix_tree_exceptional_entry(entry) &&
-                           dax_is_pmd_entry(entry) &&
+                       if (dax_is_pmd_entry(entry) &&
                            (dax_is_zero_entry(entry) ||
                             dax_is_empty_entry(entry))) {
                                pmd_downgrade = true;
@@ -346,7 +345,7 @@ restart:
                                mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
                if (err) {
                        if (pmd_downgrade)
-                               put_locked_mapping_entry(mapping, index, entry);
+                               put_locked_mapping_entry(mapping, index);
                        return ERR_PTR(err);
                }
                spin_lock_irq(&mapping->tree_lock);
@@ -396,21 +395,6 @@ restart:
                spin_unlock_irq(&mapping->tree_lock);
                return entry;
        }
-       /* Normal page in radix tree? */
-       if (!radix_tree_exceptional_entry(entry)) {
-               struct page *page = entry;
-
-               get_page(page);
-               spin_unlock_irq(&mapping->tree_lock);
-               lock_page(page);
-               /* Page got truncated? Retry... */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto restart;
-               }
-               return page;
-       }
        entry = lock_slot(mapping, slot);
  out_unlock:
        spin_unlock_irq(&mapping->tree_lock);
@@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, NULL);
-       if (!entry || !radix_tree_exceptional_entry(entry))
+       if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
                goto out;
        if (!trunc &&
            (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                                      unsigned long flags)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       int error = 0;
-       bool hole_fill = false;
        void *new_entry;
        pgoff_t index = vmf->pgoff;
 
        if (vmf->flags & FAULT_FLAG_WRITE)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-       /* Replacing hole page with block mapping? */
-       if (!radix_tree_exceptional_entry(entry)) {
-               hole_fill = true;
-               /*
-                * Unmap the page now before we remove it from page cache below.
-                * The page is locked so it cannot be faulted in again.
-                */
-               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-                                   PAGE_SIZE, 0);
-               error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
-               if (error)
-                       return ERR_PTR(error);
-       } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
-               /* replacing huge zero page with PMD block mapping */
-               unmap_mapping_range(mapping,
-                       (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+       if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+               /* we are replacing a zero page with block mapping */
+               if (dax_is_pmd_entry(entry))
+                       unmap_mapping_range(mapping,
+                                       (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+                                       PMD_SIZE, 0);
+               else /* pte entry */
+                       unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                       PAGE_SIZE, 0);
        }
 
        spin_lock_irq(&mapping->tree_lock);
        new_entry = dax_radix_locked_entry(sector, flags);
 
-       if (hole_fill) {
-               __delete_from_page_cache(entry, NULL);
-               /* Drop pagecache reference */
-               put_page(entry);
-               error = __radix_tree_insert(page_tree, index,
-                               dax_radix_order(new_entry), new_entry);
-               if (error) {
-                       new_entry = ERR_PTR(error);
-                       goto unlock;
-               }
-               mapping->nrexceptional++;
-       } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+       if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                /*
                 * Only swap our new entry into the radix tree if the current
                 * entry is a zero page or an empty entry.  If a normal PTE or
@@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                WARN_ON_ONCE(ret != entry);
                __radix_tree_replace(page_tree, node, slot,
                                     new_entry, NULL, NULL);
+               entry = new_entry;
        }
+
        if (vmf->flags & FAULT_FLAG_WRITE)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
        spin_unlock_irq(&mapping->tree_lock);
-       if (hole_fill) {
-               radix_tree_preload_end();
-               /*
-                * We don't need hole page anymore, it has been replaced with
-                * locked radix tree entry now.
-                */
-               if (mapping->a_ops->freepage)
-                       mapping->a_ops->freepage(entry);
-               unlock_page(entry);
-               put_page(entry);
-       }
-       return new_entry;
+       return entry;
 }
 
 static inline unsigned long
@@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev,
        spin_lock_irq(&mapping->tree_lock);
        entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
        /* Entry got punched out / reallocated? */
-       if (!entry2 || !radix_tree_exceptional_entry(entry2))
+       if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
                goto put_unlocked;
        /*
         * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev,
        trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  dax_unlock:
        dax_read_unlock(id);
-       put_locked_mapping_entry(mapping, index, entry);
+       put_locked_mapping_entry(mapping, index);
        return ret;
 
  put_unlocked:
@@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
                struct block_device *bdev, struct dax_device *dax_dev,
-               sector_t sector, size_t size, void **entryp,
+               sector_t sector, size_t size, void *entry,
                struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        unsigned long vaddr = vmf->address;
-       void *entry = *entryp;
        void *ret, *kaddr;
        pgoff_t pgoff;
        int id, rc;
@@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping,
        ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
        if (IS_ERR(ret))
                return PTR_ERR(ret);
-       *entryp = ret;
 
        trace_dax_insert_mapping(mapping->host, vmf, ret);
-       return vm_insert_mixed(vma, vaddr, pfn);
-}
-
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
- */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-       struct file *file = vmf->vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       void *entry, **slot;
-       pgoff_t index = vmf->pgoff;
-
-       spin_lock_irq(&mapping->tree_lock);
-       entry = get_unlocked_mapping_entry(mapping, index, &slot);
-       if (!entry || !radix_tree_exceptional_entry(entry)) {
-               if (entry)
-                       put_unlocked_mapping_entry(mapping, index, entry);
-               spin_unlock_irq(&mapping->tree_lock);
-               trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
-               return VM_FAULT_NOPAGE;
-       }
-       radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-       entry = lock_slot(mapping, slot);
-       spin_unlock_irq(&mapping->tree_lock);
-       /*
-        * If we race with somebody updating the PTE and finish_mkwrite_fault()
-        * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
-        * the fault in either case.
-        */
-       finish_mkwrite_fault(vmf);
-       put_locked_mapping_entry(mapping, index, entry);
-       trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
-       return VM_FAULT_NOPAGE;
+       if (vmf->flags & FAULT_FLAG_WRITE)
+               return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+       else
+               return vm_insert_mixed(vma, vaddr, pfn);
 }
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
 /*
- * The user has performed a load from a hole in the file.  Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files.  We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
+ * The user has performed a load from a hole in the file.  Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
  */
-static int dax_load_hole(struct address_space *mapping, void **entry,
+static int dax_load_hole(struct address_space *mapping, void *entry,
                         struct vm_fault *vmf)
 {
        struct inode *inode = mapping->host;
-       struct page *page;
-       int ret;
-
-       /* Hole page already exists? Return it...  */
-       if (!radix_tree_exceptional_entry(*entry)) {
-               page = *entry;
-               goto finish_fault;
-       }
+       unsigned long vaddr = vmf->address;
+       int ret = VM_FAULT_NOPAGE;
+       struct page *zero_page;
+       void *entry2;
 
-       /* This will replace locked radix tree entry with a hole page */
-       page = find_or_create_page(mapping, vmf->pgoff,
-                                  vmf->gfp_mask | __GFP_ZERO);
-       if (!page) {
+       zero_page = ZERO_PAGE(0);
+       if (unlikely(!zero_page)) {
                ret = VM_FAULT_OOM;
                goto out;
        }
 
-finish_fault:
-       vmf->page = page;
-       ret = finish_fault(vmf);
-       vmf->page = NULL;
-       *entry = page;
-       if (!ret) {
-               /* Grab reference for PTE that is now referencing the page */
-               get_page(page);
-               ret = VM_FAULT_NOPAGE;
+       entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+                       RADIX_DAX_ZERO_PAGE);
+       if (IS_ERR(entry2)) {
+               ret = VM_FAULT_SIGBUS;
+               goto out;
        }
+
+       vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
 out:
        trace_dax_load_hole(inode, vmf, ret);
        return ret;
@@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                        major = VM_FAULT_MAJOR;
                }
                error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-                               sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+                               sector, PAGE_SIZE, entry, vmf->vma, vmf);
                /* -EBUSY is fine, somebody else faulted on the same PTE */
                if (error == -EBUSY)
                        error = 0;
@@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                       vmf_ret = dax_load_hole(mapping, &entry, vmf);
+                       vmf_ret = dax_load_hole(mapping, entry, vmf);
                        goto finish_iomap;
                }
                /*FALLTHRU*/
@@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
        }
  unlock_entry:
-       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+       put_locked_mapping_entry(mapping, vmf->pgoff);
  out:
        trace_dax_pte_fault_done(inode, vmf, vmf_ret);
        return vmf_ret;
@@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 #define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-               loff_t pos, void **entryp)
+               loff_t pos, void *entry)
 {
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
                goto unlock_fallback;
        dax_read_unlock(id);
 
-       ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+       ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
                        RADIX_DAX_PMD);
        if (IS_ERR(ret))
                goto fallback;
-       *entryp = ret;
 
        trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1231,7 @@ fallback:
 }
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-               void **entryp)
+               void *entry)
 {
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
        if (unlikely(!zero_page))
                goto fallback;
 
-       ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
-                       RADIX_DAX_PMD | RADIX_DAX_HZP);
+       ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
        if (IS_ERR(ret))
                goto fallback;
-       *entryp = ret;
 
        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                goto fallback;
 
        /*
-        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-        * PMD or a HZP entry.  If it can't (because a 4k page is already in
-        * the tree, for instance), it will return -EEXIST and we just fall
-        * back to 4k entries.
+        * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+        * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
+        * is already in the tree, for instance), it will return -EEXIST and
+        * we just fall back to 4k entries.
         */
        entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
        if (IS_ERR(entry))
@@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
        switch (iomap.type) {
        case IOMAP_MAPPED:
-               result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+               result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (WARN_ON_ONCE(write))
                        break;
-               result = dax_pmd_load_hole(vmf, &iomap, &entry);
+               result = dax_pmd_load_hole(vmf, &iomap, entry);
                break;
        default:
                WARN_ON_ONCE(1);
@@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                                &iomap);
        }
  unlock_entry:
-       put_locked_mapping_entry(mapping, pgoff, entry);
+       put_locked_mapping_entry(mapping, pgoff);
  fallback:
        if (result == VM_FAULT_FALLBACK) {
                split_huge_pmd(vma, vmf->pmd, vmf->address);
index d34d32bdc944add56a4375827e044c8482665c8c..ff3a3636a5cab588fb05f0644e1892ce6e236c6f 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
        return ret;
 }
 
-static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       struct ext2_inode_info *ei = EXT2_I(inode);
-       loff_t size;
-       int ret;
-
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vmf->vma->vm_file);
-       down_read(&ei->dax_sem);
-
-       /* check that the faulting page hasn't raced with truncate */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size)
-               ret = VM_FAULT_SIGBUS;
-       else
-               ret = dax_pfn_mkwrite(vmf);
-
-       up_read(&ei->dax_sem);
-       sb_end_pagefault(inode->i_sb);
-       return ret;
-}
-
 static const struct vm_operations_struct ext2_dax_vm_ops = {
        .fault          = ext2_dax_fault,
        /*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
         * will always fail and fail back to regular faults.
         */
        .page_mkwrite   = ext2_dax_fault,
-       .pfn_mkwrite    = ext2_dax_pfn_mkwrite,
+       .pfn_mkwrite    = ext2_dax_fault,
 };
 
 static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
index 0d7cf0cc9b87562bbc52b17224794d9147208910..f28ac999dfbaa23a86da1b8e704ed4f63ca2fcd7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -311,41 +311,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
        return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
 
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       struct super_block *sb = inode->i_sb;
-       loff_t size;
-       int ret;
-
-       sb_start_pagefault(sb);
-       file_update_time(vmf->vma->vm_file);
-       down_read(&EXT4_I(inode)->i_mmap_sem);
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size)
-               ret = VM_FAULT_SIGBUS;
-       else
-               ret = dax_pfn_mkwrite(vmf);
-       up_read(&EXT4_I(inode)->i_mmap_sem);
-       sb_end_pagefault(sb);
-
-       return ret;
-}
-
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .huge_fault     = ext4_dax_huge_fault,
        .page_mkwrite   = ext4_dax_fault,
-       .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
+       .pfn_mkwrite    = ext4_dax_fault,
 };
 #else
 #define ext4_dax_vm_ops        ext4_file_vm_ops
index c4893e226fd8e7ddebf8ee22aed420d5d122d473..62db8ffa83b9185408f65ab8541d70d0587ea9d0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1130,7 +1130,7 @@ xfs_filemap_pfn_mkwrite(
        if (vmf->pgoff >= size)
                ret = VM_FAULT_SIGBUS;
        else if (IS_DAX(inode))
-               ret = dax_pfn_mkwrite(vmf);
+               ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
        sb_end_pagefault(inode->i_sb);
        return ret;
index df97b7af7e2c7263c4ae9fb1608b19aad1ae02ba..b3518559f0da0217ea834cf974f277bd446ce9e6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -91,18 +91,17 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a huge zero
- * page (HZP) or an empty entry that is just used for locking.  In total four
- * special bits.
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking.  In total four special bits.
  *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
- * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
 #define RADIX_DAX_SHIFT        (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
 static inline unsigned long dax_radix_sector(void *entry)
@@ -153,7 +152,6 @@ static inline unsigned int dax_radix_order(void *entry)
        return 0;
 }
 #endif
-int dax_pfn_mkwrite(struct vm_fault *vmf);
 
 static inline bool dax_mapping(struct address_space *mapping)
 {
index 08bb3ed18dcc213dbbdcfad0832a83a8fbb2ed4e..fbc4a06f7310a0d04fdfbd702d94663edcc04b56 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
 
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
-DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
-DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
 DEFINE_PTE_FAULT_EVENT(dax_load_hole);
 
 TRACE_EVENT(dax_insert_mapping,