Merge branch 'for-4.18/dax' into libnvdimm-for-next
author Dan Williams <dan.j.williams@intel.com>
Fri, 8 Jun 2018 22:16:40 +0000 (15:16 -0700)
committer Dan Williams <dan.j.williams@intel.com>
Fri, 8 Jun 2018 22:16:40 +0000 (15:16 -0700)
drivers/dax/super.c
drivers/nvdimm/pmem.c
fs/xfs/xfs_file.c
include/linux/mm.h
mm/Kconfig
mm/gup.c

diff --combined drivers/dax/super.c
index c2c46f96b18c9d05833ee4d9985a2bfa2aa2c5f2,bf8bfaf5596f30c015dab825222038bf673f314d..60d01b5d2a6710e7b43bb340db1bdc24d64d2775
@@@ -86,6 -86,7 +86,7 @@@ int __bdev_dax_supported(struct super_b
  {
        struct block_device *bdev = sb->s_bdev;
        struct dax_device *dax_dev;
+       bool dax_enabled = false;
        pgoff_t pgoff;
        int err, id;
        void *kaddr;
                 * on being able to do (page_address(pfn_to_page())).
                 */
                WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+               dax_enabled = true;
        } else if (pfn_t_devmap(pfn)) {
-               /* pass */;
-       } else {
+               struct dev_pagemap *pgmap;
+               pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
+               if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
+                       dax_enabled = true;
+               put_dev_pagemap(pgmap);
+       }
+
+       if (!dax_enabled) {
                pr_debug("VFS (%s): error: dax support not enabled\n",
                                sb->s_id);
                return -EOPNOTSUPP;
        }
        return 0;
  }
  EXPORT_SYMBOL_GPL(__bdev_dax_supported);
@@@ -182,7 -190,8 +190,7 @@@ static ssize_t write_cache_show(struct 
        if (!dax_dev)
                return -ENXIO;
  
 -      rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
 -                              &dax_dev->flags));
 +      rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
        put_dax(dax_dev);
        return rc;
  }
@@@ -200,8 -209,10 +208,8 @@@ static ssize_t write_cache_store(struc
  
        if (rc)
                len = rc;
 -      else if (write_cache)
 -              set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
        else
 -              clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
 +              dax_write_cache(dax_dev, write_cache);
  
        put_dax(dax_dev);
        return len;
@@@ -283,7 -294,7 +291,7 @@@ EXPORT_SYMBOL_GPL(dax_copy_from_iter)
  void arch_wb_cache_pmem(void *addr, size_t size);
  void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
  {
 -      if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
 +      if (unlikely(!dax_write_cache_enabled(dax_dev)))
                return;
  
        arch_wb_cache_pmem(addr, size);
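
The __bdev_dax_supported() change above stops treating any devmap pfn as good enough: DAX is only reported as supported once the pfn is positively identified either as direct-mapped pmem or as backed by a MEMORY_DEVICE_FS_DAX pagemap. A condensed sketch of that new branch follows, using only helpers visible in the hunk; the function name is invented for illustration.

#include <linux/memremap.h>
#include <linux/pfn_t.h>

/* Illustrative condensation of the new check in __bdev_dax_supported(). */
static bool pfn_backed_by_fsdax(pfn_t pfn)
{
        struct dev_pagemap *pgmap;
        bool dax_enabled = false;

        if (!pfn_t_devmap(pfn))
                return false;

        pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
        if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
                dax_enabled = true;
        put_dev_pagemap(pgmap);         /* called unconditionally, as in the hunk */

        return dax_enabled;
}
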
diff --combined drivers/nvdimm/pmem.c
index 97b4c39a9267a5d0abc17342780d025c26bb5280,06b41ec9f1b3747624d1f4d4eb4920efcc376d06..bf2dd2a4a5e60050f092ee3c2f14b01b76781b53
@@@ -164,6 -164,11 +164,6 @@@ static blk_status_t pmem_do_bvec(struc
        return rc;
  }
  
 -/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
 -#ifndef REQ_FLUSH
 -#define REQ_FLUSH REQ_PREFLUSH
 -#endif
 -
  static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
  {
        blk_status_t rc = 0;
        struct pmem_device *pmem = q->queuedata;
        struct nd_region *nd_region = to_region(pmem);
  
 -      if (bio->bi_opf & REQ_FLUSH)
 +      if (bio->bi_opf & REQ_PREFLUSH)
                nvdimm_flush(nd_region);
  
        do_acct = nd_iostat_start(bio, &start);
@@@ -289,12 -294,33 +289,33 @@@ static void pmem_release_disk(void *__p
        put_disk(pmem->disk);
  }
  
+ static void pmem_release_pgmap_ops(void *__pgmap)
+ {
+       dev_pagemap_put_ops();
+ }
+
+ static void fsdax_pagefree(struct page *page, void *data)
+ {
+       wake_up_var(&page->_refcount);
+ }
+
+ static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
+ {
+       dev_pagemap_get_ops();
+       if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
+               return -ENOMEM;
+       pgmap->type = MEMORY_DEVICE_FS_DAX;
+       pgmap->page_free = fsdax_pagefree;
+       return 0;
+ }
+
  static int pmem_attach_disk(struct device *dev,
                struct nd_namespace_common *ndns)
  {
        struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
        struct nd_region *nd_region = to_nd_region(dev->parent);
 -      int nid = dev_to_node(dev), fua, wbc;
 +      int nid = dev_to_node(dev), fua;
        struct resource *res = &nsio->res;
        struct resource bb_res;
        struct nd_pfn *nd_pfn = NULL;
                dev_warn(dev, "unable to guarantee persistence of writes\n");
                fua = 0;
        }
 -      wbc = nvdimm_has_cache(nd_region);
  
        if (!devm_request_mem_region(dev, res->start, resource_size(res),
                                dev_name(&ndns->dev))) {
        pmem->pfn_flags = PFN_DEV;
        pmem->pgmap.ref = &q->q_usage_counter;
        if (is_nd_pfn(dev)) {
+               if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+                       return -ENOMEM;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pfn_sb = nd_pfn->pfn_sb;
                pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
        } else if (pmem_should_map_pages(dev)) {
                memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
                pmem->pgmap.altmap_valid = false;
+               if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+                       return -ENOMEM;
                addr = devm_memremap_pages(dev, &pmem->pgmap);
                pmem->pfn_flags |= PFN_MAP;
                memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
                return PTR_ERR(addr);
        pmem->virt_addr = addr;
  
 -      blk_queue_write_cache(q, wbc, fua);
 +      blk_queue_write_cache(q, true, fua);
        blk_queue_make_request(q, pmem_make_request);
        blk_queue_physical_block_size(q, PAGE_SIZE);
        blk_queue_logical_block_size(q, pmem_sector_size(ndns));
                put_disk(disk);
                return -ENOMEM;
        }
 -      dax_write_cache(dax_dev, wbc);
 +      dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
        pmem->dax_dev = dax_dev;
  
        gendev = disk_to_dev(disk);
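
setup_pagemap_fsdax() above is what opts the pmem namespace into device-managed page refcounting: it pins the dev_pagemap ops (released again by a devm action), tags the pagemap as MEMORY_DEVICE_FS_DAX, and installs a page_free callback that wakes anyone waiting on the page refcount. A hedged sketch of another ZONE_DEVICE driver following the same pattern; the my_* names are invented, the helpers are the ones used above, and pgmap->res/pgmap->ref are assumed to be filled in by the caller just as pmem does.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/wait_bit.h>

static void my_release_pgmap_ops(void *data)
{
        dev_pagemap_put_ops();
}

static void my_page_free(struct page *page, void *data)
{
        /* unblock dax_layout_busy_page() waiters, as pmem's callback does */
        wake_up_var(&page->_refcount);
}

/* Illustrative probe step; error handling beyond -ENOMEM is elided. */
static int my_probe(struct device *dev, struct dev_pagemap *pgmap)
{
        void *addr;

        dev_pagemap_get_ops();
        if (devm_add_action_or_reset(dev, my_release_pgmap_ops, NULL))
                return -ENOMEM;

        pgmap->type = MEMORY_DEVICE_FS_DAX;
        pgmap->page_free = my_page_free;

        addr = devm_memremap_pages(dev, pgmap);
        return IS_ERR(addr) ? PTR_ERR(addr) : 0;
}
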
diff --combined fs/xfs/xfs_file.c
index e70fb8cceceaa5d2333573e49460beba75629815,f5695dc314f109a82ee856087ed36dcc440e0c36..19b0c3e0e232203ffb2c8f7c60d640424542b2b7
@@@ -312,7 -312,7 +312,7 @@@ restart
        if (error <= 0)
                return error;
  
-       error = xfs_break_layouts(inode, iolock);
+       error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
        if (error)
                return error;
  
@@@ -718,6 -718,69 +718,69 @@@ buffered
        return ret;
  }
  
+ static void
+ xfs_wait_dax_page(
+       struct inode            *inode,
+       bool                    *did_unlock)
+ {
+       struct xfs_inode        *ip = XFS_I(inode);
+       *did_unlock = true;
+       xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+       schedule();
+       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ }
+
+ static int
+ xfs_break_dax_layouts(
+       struct inode            *inode,
+       uint                    iolock,
+       bool                    *did_unlock)
+ {
+       struct page             *page;
+       ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+       page = dax_layout_busy_page(inode->i_mapping);
+       if (!page)
+               return 0;
+       return ___wait_var_event(&page->_refcount,
+                       atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+                       0, 0, xfs_wait_dax_page(inode, did_unlock));
+ }
+
+ int
+ xfs_break_layouts(
+       struct inode            *inode,
+       uint                    *iolock,
+       enum layout_break_reason reason)
+ {
+       bool                    retry;
+       int                     error;
+       ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+       do {
+               retry = false;
+               switch (reason) {
+               case BREAK_UNMAP:
+                       error = xfs_break_dax_layouts(inode, *iolock, &retry);
+                       if (error || retry)
+                               break;
+                       /* fall through */
+               case BREAK_WRITE:
+                       error = xfs_break_leased_layouts(inode, iolock, &retry);
+                       break;
+               default:
+                       WARN_ON_ONCE(1);
+                       error = -EINVAL;
+               }
+       } while (error == 0 && retry);
+       return error;
+ }
+
  #define       XFS_FALLOC_FL_SUPPORTED                                         \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
@@@ -734,7 -797,7 +797,7 @@@ xfs_file_fallocate
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        enum xfs_prealloc_flags flags = 0;
-       uint                    iolock = XFS_IOLOCK_EXCL;
+       uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        loff_t                  new_size = 0;
        bool                    do_file_insert = false;
  
                return -EOPNOTSUPP;
  
        xfs_ilock(ip, iolock);
-       error = xfs_break_layouts(inode, &iolock);
+       error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
                goto out_unlock;
  
-       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-       iolock |= XFS_MMAPLOCK_EXCL;
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
 -              unsigned int blksize_mask = i_blocksize(inode) - 1;
 +              unsigned int    blksize_mask = i_blocksize(inode) - 1;
 +              loff_t          isize = i_size_read(inode);
  
 -              new_size = i_size_read(inode) + len;
                if (offset & blksize_mask || len & blksize_mask) {
                        error = -EINVAL;
                        goto out_unlock;
                }
  
 -              /* check the new inode size does not wrap through zero */
 -              if (new_size > inode->i_sb->s_maxbytes) {
 +              /*
 +               * New inode size must not exceed ->s_maxbytes, accounting for
 +               * possible signed overflow.
 +               */
 +              if (inode->i_sb->s_maxbytes - isize < len) {
                        error = -EFBIG;
                        goto out_unlock;
                }
 +              new_size = isize + len;
  
                /* Offset should be less than i_size */
 -              if (offset >= i_size_read(inode)) {
 +              if (offset >= isize) {
                        error = -EINVAL;
                        goto out_unlock;
                }
@@@ -880,18 -936,8 +940,18 @@@ xfs_file_dedupe_range
        struct file     *dst_file,
        u64             dst_loff)
  {
 +      struct inode    *srci = file_inode(src_file);
 +      u64             max_dedupe;
        int             error;
  
 +      /*
 +       * Since we have to read all these pages in to compare them, cut
 +       * it off at MAX_RW_COUNT/2 rounded down to the nearest block.
 +       * That means we won't do more than MAX_RW_COUNT IO per request.
 +       */
 +      max_dedupe = (MAX_RW_COUNT >> 1) & ~(i_blocksize(srci) - 1);
 +      if (len > max_dedupe)
 +              len = max_dedupe;
        error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
                                     len, true);
        if (error)
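
xfs_break_layouts() now distinguishes BREAK_WRITE (break pNFS leases only) from BREAK_UNMAP (additionally drain DAX pages still pinned by get_user_pages()), and xfs_break_dax_layouts() sleeps on exactly the refcount that the pmem page_free callback wakes. A minimal sketch of that wait, with the XFS locking stripped out (the real code drops and retakes XFS_MMAPLOCK_EXCL around the sleep via xfs_wait_dax_page()); the function name is invented.

#include <linux/atomic.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

/* Sleep until the first busy DAX page of the mapping drops back to refcount 1. */
static int wait_for_dax_pins(struct inode *inode)
{
        struct page *page;

        page = dax_layout_busy_page(inode->i_mapping);
        if (!page)
                return 0;       /* no pinned DAX pages, safe to proceed */

        /* woken by wake_up_var(&page->_refcount) from the driver side */
        return ___wait_var_event(&page->_refcount,
                        atomic_read(&page->_refcount) == 1,
                        TASK_INTERRUPTIBLE, 0, 0, schedule());
}
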
diff --combined include/linux/mm.h
index 02a616e2f17d0ff653a5fa1b0f9721423266cf59,6e19265ee8f86e50852a73f8e8d1488b376c5309..274d5242bd0d8443124fcee6aec5b15cf637ee99
@@@ -821,27 -821,65 +821,65 @@@ static inline bool is_zone_device_page(
  }
  #endif
  
- #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
- void put_zone_device_private_or_public_page(struct page *page);
- DECLARE_STATIC_KEY_FALSE(device_private_key);
- #define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
- static inline bool is_device_private_page(const struct page *page);
- static inline bool is_device_public_page(const struct page *page);
- #else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
- static inline void put_zone_device_private_or_public_page(struct page *page)
+ #ifdef CONFIG_DEV_PAGEMAP_OPS
+ void dev_pagemap_get_ops(void);
+ void dev_pagemap_put_ops(void);
+ void __put_devmap_managed_page(struct page *page);
+ DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
+ static inline bool put_devmap_managed_page(struct page *page)
+ {
+       if (!static_branch_unlikely(&devmap_managed_key))
+               return false;
+       if (!is_zone_device_page(page))
+               return false;
+       switch (page->pgmap->type) {
+       case MEMORY_DEVICE_PRIVATE:
+       case MEMORY_DEVICE_PUBLIC:
+       case MEMORY_DEVICE_FS_DAX:
+               __put_devmap_managed_page(page);
+               return true;
+       default:
+               break;
+       }
+       return false;
+ }
+
+ static inline bool is_device_private_page(const struct page *page)
+ {
+       return is_zone_device_page(page) &&
+               page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+ }
+
+ static inline bool is_device_public_page(const struct page *page)
+ {
+       return is_zone_device_page(page) &&
+               page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+ }
+
+ #else /* CONFIG_DEV_PAGEMAP_OPS */
+ static inline void dev_pagemap_get_ops(void)
  {
  }
- #define IS_HMM_ENABLED 0
+ static inline void dev_pagemap_put_ops(void)
+ {
+ }
+
+ static inline bool put_devmap_managed_page(struct page *page)
+ {
+       return false;
+ }
  static inline bool is_device_private_page(const struct page *page)
  {
        return false;
  }
  static inline bool is_device_public_page(const struct page *page)
  {
        return false;
  }
- #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+ #endif /* CONFIG_DEV_PAGEMAP_OPS */
  
  static inline void get_page(struct page *page)
  {
@@@ -859,16 -897,13 +897,13 @@@ static inline void put_page(struct pag
        page = compound_head(page);
  
        /*
-        * For private device pages we need to catch refcount transition from
-        * 2 to 1, when refcount reach one it means the private device page is
-        * free and we need to inform the device driver through callback. See
+        * For devmap managed pages we need to catch refcount transition from
+        * 2 to 1, when refcount reach one it means the page is free and we
+        * need to inform the device driver through callback. See
         * include/linux/memremap.h and HMM for details.
         */
-       if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
-           unlikely(is_device_public_page(page)))) {
-               put_zone_device_private_or_public_page(page);
+       if (put_devmap_managed_page(page))
                return;
-       }
  
        if (put_page_testzero(page))
                __put_page(page);
@@@ -2109,6 -2144,7 +2144,6 @@@ extern void setup_per_cpu_pageset(void)
  
  extern void zone_pcp_update(struct zone *zone);
  extern void zone_pcp_reset(struct zone *zone);
 -extern void setup_zone_pageset(struct zone *zone);
  
  /* page_alloc.c */
  extern int min_free_kbytes;
@@@ -2465,13 -2501,6 +2500,13 @@@ static inline vm_fault_t vmf_insert_pfn
        return VM_FAULT_NOPAGE;
  }
  
 +static inline vm_fault_t vmf_error(int err)
 +{
 +      if (err == -ENOMEM)
 +              return VM_FAULT_OOM;
 +      return VM_FAULT_SIGBUS;
 +}
 +
  struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int foll_flags,
                              unsigned int *page_mask);
@@@ -2499,7 -2528,6 +2534,7 @@@ static inline struct page *follow_page(
  #define FOLL_MLOCK    0x1000  /* lock present pages */
  #define FOLL_REMOTE   0x2000  /* we are working on non-current tsk/mm */
  #define FOLL_COW      0x4000  /* internal GUP flag */
 +#define FOLL_ANON     0x8000  /* don't do file mappings */
  
  static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
  {
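
On the mm.h side the HMM-only static key and put_zone_device_private_or_public_page() give way to a generic put_devmap_managed_page() that also covers MEMORY_DEVICE_FS_DAX, so put_page() hands the 2->1 refcount transition back to the owning driver. A short lifecycle sketch of what that buys fs-dax; the function name is invented, and the callback behaviour noted in the comments is the pmem wake_up_var() hook from earlier in this merge.

#include <linux/mm.h>

/*
 * Illustrative pin/unpin lifecycle for an fs-dax page under the new
 * scheme: an idle page sits at _refcount == 1, not 0.
 */
static void fsdax_pin_lifecycle(struct page *page)
{
        get_page(page);         /* e.g. via get_user_pages(): 1 -> 2 */

        /*
         * While pinned the filesystem must not reuse the page's blocks;
         * dax_layout_busy_page() reports it as busy to truncate paths.
         */

        /*
         * 2 -> 1: put_devmap_managed_page() routes to
         * __put_devmap_managed_page(), which ends up calling
         * pgmap->page_free() and so wakes any waiter parked in
         * xfs_break_dax_layouts().
         */
        put_page(page);
}
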
diff --combined mm/Kconfig
index e14c01513bfd0777baec40a0a3555571c1621b11,bf9d6366bced21781767fb495c6b1a7c73faa3b0..5f39bca5d82b005d9cb12ba7d5df10fce2b2881d
@@@ -636,7 -636,6 +636,7 @@@ config DEFERRED_STRUCT_PAGE_INI
        default n
        depends on NO_BOOTMEM
        depends on !FLATMEM
 +      depends on !NEED_PER_CPU_KM
        help
          Ordinarily all struct pages are initialised during early boot in a
          single thread. On very large machines this can take a considerable
@@@ -693,6 -692,9 +693,9 @@@ config ARCH_HAS_HM
  config MIGRATE_VMA_HELPER
        bool
  
+ config DEV_PAGEMAP_OPS
+       bool
+
  config HMM
        bool
        select MIGRATE_VMA_HELPER
@@@ -713,6 -715,7 +716,7 @@@ config DEVICE_PRIVAT
        bool "Unaddressable device memory (GPU memory, ...)"
        depends on ARCH_HAS_HMM
        select HMM
+       select DEV_PAGEMAP_OPS
  
        help
          Allows creation of struct pages to represent unaddressable device
@@@ -723,6 -726,7 +727,7 @@@ config DEVICE_PUBLI
        bool "Addressable device memory (like GPU memory)"
        depends on ARCH_HAS_HMM
        select HMM
+       select DEV_PAGEMAP_OPS
  
        help
          Allows creation of struct pages to represent addressable device
diff --combined mm/gup.c
index 541904a7c60fd596b1f5bc432fc7932bc005fadf,84dd2063ca3d96bd4140aae658aceaa3f447d4bb..3d8472d48a0b88366e6e60e3c17fee1fe0dae7d3
+++ b/mm/gup.c
@@@ -544,9 -544,6 +544,9 @@@ static int check_vma_flags(struct vm_ar
        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;
  
 +      if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 +              return -EFAULT;
 +
        if (write) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
@@@ -1459,32 -1456,48 +1459,48 @@@ static int __gup_device_huge(unsigned l
        return 1;
  }
  
- static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, struct page **pages, int *nr)
  {
        unsigned long fault_pfn;
+       int nr_start = *nr;
+       fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+       if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+               return 0;
  
-       fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+       if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+               undo_dev_pagemap(nr, nr_start, pages);
+               return 0;
+       }
+       return 1;
  }
  
- static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, struct page **pages, int *nr)
  {
        unsigned long fault_pfn;
+       int nr_start = *nr;
+       fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+       if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+               return 0;
  
-       fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-       return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+       if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+               undo_dev_pagemap(nr, nr_start, pages);
+               return 0;
+       }
+       return 1;
  }
  #else
- static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, struct page **pages, int *nr)
  {
        BUILD_BUG();
        return 0;
  }
  
- static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
                unsigned long end, struct page **pages, int *nr)
  {
        BUILD_BUG();
@@@ -1502,7 -1515,7 +1518,7 @@@ static int gup_huge_pmd(pmd_t orig, pmd
                return 0;
  
        if (pmd_devmap(orig))
-               return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+               return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
  
        refs = 0;
        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@@ -1540,7 -1553,7 +1556,7 @@@ static int gup_huge_pud(pud_t orig, pud
                return 0;
  
        if (pud_devmap(orig))
-               return __gup_device_huge_pud(orig, addr, end, pages, nr);
+               return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
  
        refs = 0;
        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
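
Finally, the gup-fast device-huge helpers now receive the pmd/pud pointer as well as its snapshot so they can re-validate the entry after taking page references; if a racing unmap or truncate (which the BREAK_UNMAP path above can now force) replaced the entry, the references are rolled back with undo_dev_pagemap(). The same recheck idiom as the hunk, restated with comments for the pmd case; it relies on the file-local helpers __gup_device_huge() and undo_dev_pagemap().

static int gup_device_huge_pmd_sketch(pmd_t orig, pmd_t *pmdp,
                unsigned long addr, unsigned long end,
                struct page **pages, int *nr)
{
        unsigned long fault_pfn;
        int nr_start = *nr;     /* remember where this entry's pages begin */

        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
                return 0;

        /*
         * Lockless re-validation: the references above only pin the right
         * pages if the huge entry still points at them.  If a concurrent
         * unmap or truncate changed it, drop everything we took.
         */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                undo_dev_pagemap(nr, nr_start, pages);
                return 0;
        }
        return 1;
}
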