xarray: Replace exceptional entries
index 897b51e41d8f0f741c7514bc3ddd8c5bb4277b4a..ebcec36335eb07ab6347c5b94be2727a5bd926b9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -59,56 +59,57 @@ static int __init init_dax_wait_table(void)
 fs_initcall(init_dax_wait_table);
 
 /*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking.  In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages.  We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking.  In total four special bits.
  *
  * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
-#define RADIX_DAX_SHIFT                (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK   (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD          (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE    (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY                (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT      (4)
+#define DAX_LOCKED     (1UL << 0)
+#define DAX_PMD                (1UL << 1)
+#define DAX_ZERO_PAGE  (1UL << 2)
+#define DAX_EMPTY      (1UL << 3)
 
 static unsigned long dax_radix_pfn(void *entry)
 {
-       return (unsigned long)entry >> RADIX_DAX_SHIFT;
+       return xa_to_value(entry) >> DAX_SHIFT;
 }
 
 static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
 {
-       return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-                       (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+       return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
+                       DAX_LOCKED);
 }
 
 static unsigned int dax_radix_order(void *entry)
 {
-       if ((unsigned long)entry & RADIX_DAX_PMD)
+       if (xa_to_value(entry) & DAX_PMD)
                return PMD_SHIFT - PAGE_SHIFT;
        return 0;
 }
 
 static int dax_is_pmd_entry(void *entry)
 {
-       return (unsigned long)entry & RADIX_DAX_PMD;
+       return xa_to_value(entry) & DAX_PMD;
 }
 
 static int dax_is_pte_entry(void *entry)
 {
-       return !((unsigned long)entry & RADIX_DAX_PMD);
+       return !(xa_to_value(entry) & DAX_PMD);
 }
 
 static int dax_is_zero_entry(void *entry)
 {
-       return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+       return xa_to_value(entry) & DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
 {
-       return (unsigned long)entry & RADIX_DAX_EMPTY;
+       return xa_to_value(entry) & DAX_EMPTY;
 }
 
 /*
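An aside for readers new to the XArray: the helpers above depend on xa_mk_value()/xa_to_value() storing an unsigned long payload as (v << 1) | 1, so a value entry can never alias a real pointer. Below is a minimal userspace sketch of the round trip; the two stand-in functions mirror the definitions in <linux/xarray.h>, and the constants are copied from the patch.

	#include <stdio.h>

	/* Stand-ins mirroring <linux/xarray.h>: a value entry stores its
	 * payload as (v << 1) | 1, so it cannot be mistaken for a pointer. */
	static void *xa_mk_value(unsigned long v)
	{
		return (void *)((v << 1) | 1);
	}

	static unsigned long xa_to_value(const void *entry)
	{
		return (unsigned long)entry >> 1;
	}

	#define DAX_SHIFT	4
	#define DAX_LOCKED	(1UL << 0)
	#define DAX_PMD		(1UL << 1)

	int main(void)
	{
		/* Pack pfn 0x1234 as a locked PMD entry, then unpack it. */
		void *entry = xa_mk_value((0x1234UL << DAX_SHIFT) |
					  DAX_PMD | DAX_LOCKED);

		printf("pfn=%#lx pmd=%d locked=%d\n",
		       xa_to_value(entry) >> DAX_SHIFT,
		       !!(xa_to_value(entry) & DAX_PMD),
		       !!(xa_to_value(entry) & DAX_LOCKED));
		return 0;	/* prints: pfn=0x1234 pmd=1 locked=1 */
	}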
@@ -186,9 +187,9 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
  */
 static inline int slot_locked(struct address_space *mapping, void **slot)
 {
-       unsigned long entry = (unsigned long)
-               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-       return entry & RADIX_DAX_ENTRY_LOCK;
+       unsigned long entry = xa_to_value(
+               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+       return entry & DAX_LOCKED;
 }
 
 /*
@@ -196,12 +197,11 @@ static inline int slot_locked(struct address_space *mapping, void **slot)
  */
 static inline void *lock_slot(struct address_space *mapping, void **slot)
 {
-       unsigned long entry = (unsigned long)
-               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
-       entry |= RADIX_DAX_ENTRY_LOCK;
-       radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
-       return (void *)entry;
+       unsigned long v = xa_to_value(
+               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+       void *entry = xa_mk_value(v | DAX_LOCKED);
+       radix_tree_replace_slot(&mapping->i_pages, slot, entry);
+       return entry;
 }
 
 /*
@@ -209,25 +209,24 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
  */
 static inline void *unlock_slot(struct address_space *mapping, void **slot)
 {
-       unsigned long entry = (unsigned long)
-               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
-       entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
-       radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
-       return (void *)entry;
+       unsigned long v = xa_to_value(
+               radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
+       void *entry = xa_mk_value(v & ~DAX_LOCKED);
+       radix_tree_replace_slot(&mapping->i_pages, slot, entry);
+       return entry;
 }
 
 /*
  * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
+ * a DAX entry and return it. The caller must call
  * put_unlocked_mapping_entry() when he decided not to lock the entry or
  * put_locked_mapping_entry() when he locked the entry and now wants to
  * unlock it.
  *
  * Must be called with the i_pages lock held.
  */
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-                                       pgoff_t index, void ***slotp)
+static void *__get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp, bool (*wait_fn)(void))
 {
        void *entry, **slot;
        struct wait_exceptional_entry_queue ewait;
@@ -237,10 +236,12 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
        ewait.wait.func = wake_exceptional_entry_func;
 
        for (;;) {
+               bool revalidate;
+
                entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
                                          &slot);
                if (!entry ||
-                   WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
+                   WARN_ON_ONCE(!xa_is_value(entry)) ||
                    !slot_locked(mapping, slot)) {
                        if (slotp)
                                *slotp = slot;
@@ -251,20 +252,37 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                xa_unlock_irq(&mapping->i_pages);
-               schedule();
+               revalidate = wait_fn();
                finish_wait(wq, &ewait.wait);
                xa_lock_irq(&mapping->i_pages);
+               if (revalidate)
+                       return ERR_PTR(-EAGAIN);
        }
 }
 
-static void dax_unlock_mapping_entry(struct address_space *mapping,
-                                    pgoff_t index)
+static bool entry_wait(void)
+{
+       schedule();
+       /*
+        * Never return an ERR_PTR() from
+        * __get_unlocked_mapping_entry(), just keep looping.
+        */
+       return false;
+}
+
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp)
+{
+       return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+}
+
+static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
        void *entry, **slot;
 
        xa_lock_irq(&mapping->i_pages);
        entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
-       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+       if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
                         !slot_locked(mapping, slot))) {
                xa_unlock_irq(&mapping->i_pages);
                return;
@@ -277,7 +295,7 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 static void put_locked_mapping_entry(struct address_space *mapping,
                pgoff_t index)
 {
-       dax_unlock_mapping_entry(mapping, index);
+       unlock_mapping_entry(mapping, index);
 }
 
 /*
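The wait_fn indirection above lets one lookup loop serve two kinds of callers: the normal paths block and retry forever (entry_wait() above always returns false), while dax_lock_mapping_entry(), added further down, must drop its RCU read lock and revalidate page->mapping, so its callback returns true and the loop bails out with -EAGAIN. A compilable userspace sketch of that control flow, with an illustrative flag standing in for the real locked entry:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	static bool entry_wait(void)            { return false; } /* keep looping */
	static bool entry_wait_revalidate(void) { return true;  } /* bail out     */

	static int locked = 1;	/* stand-in for the DAX_LOCKED bit */

	static int get_unlocked(bool (*wait_fn)(void))
	{
		for (;;) {
			if (!locked)
				return 0;	/* entry is ours to lock */
			/* the kernel sleeps on a waitqueue here */
			if (wait_fn())
				return -EAGAIN;	/* caller revalidates */
			locked = 0;	/* pretend the holder unlocked it */
		}
	}

	int main(void)
	{
		printf("revalidating caller: %d\n",
		       get_unlocked(entry_wait_revalidate));	/* -EAGAIN */
		printf("blocking caller:     %d\n",
		       get_unlocked(entry_wait));		/* 0 */
		return 0;
	}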
@@ -319,18 +337,27 @@ static unsigned long dax_radix_end_pfn(void *entry)
        for (pfn = dax_radix_pfn(entry); \
                        pfn < dax_radix_end_pfn(entry); pfn++)
 
-static void dax_associate_entry(void *entry, struct address_space *mapping)
+/*
+ * TODO: for reflink+dax we need a way to associate a single page with
+ * multiple address_space instances at different linear_page_index()
+ * offsets.
+ */
+static void dax_associate_entry(void *entry, struct address_space *mapping,
+               struct vm_area_struct *vma, unsigned long address)
 {
-       unsigned long pfn;
+       unsigned long size = dax_entry_size(entry), pfn, index;
+       int i = 0;
 
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;
 
+       index = linear_page_index(vma, address & ~(size - 1));
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);
 
                WARN_ON_ONCE(page->mapping);
                page->mapping = mapping;
+               page->index = index + i++;
        }
 }
 
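A worked example of the new page->index assignment (all numbers illustrative): for a PMD entry dax_entry_size() is 2MiB, so a fault at 0x7f0000201000 is first aligned down to 0x7f0000200000 by the address & ~(size - 1) mask. In a VMA that maps file offset 0 at VA 0x7f0000000000, linear_page_index() turns that into pgoff 0x200, and the loop then stamps the 512 constituent pages with page->index 0x200 through 0x3ff. Recording the index is what later allows a page to be traced back to its file offset, e.g. by reverse-mapping code.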
@@ -348,6 +375,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
                WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
                WARN_ON_ONCE(page->mapping && page->mapping != mapping);
                page->mapping = NULL;
+               page->index = 0;
        }
 }
 
@@ -364,13 +392,90 @@ static struct page *dax_busy_page(void *entry)
        return NULL;
 }
 
+static bool entry_wait_revalidate(void)
+{
+       rcu_read_unlock();
+       schedule();
+       rcu_read_lock();
+
+       /*
+        * Tell __get_unlocked_mapping_entry() to take a break, we need
+        * to revalidate page->mapping after dropping locks
+        */
+       return true;
+}
+
+bool dax_lock_mapping_entry(struct page *page)
+{
+       pgoff_t index;
+       struct inode *inode;
+       bool did_lock = false;
+       void *entry = NULL, **slot;
+       struct address_space *mapping;
+
+       rcu_read_lock();
+       for (;;) {
+               mapping = READ_ONCE(page->mapping);
+
+               if (!dax_mapping(mapping))
+                       break;
+
+               /*
+                * In the device-dax case there's no need to lock, a
+                * struct dev_pagemap pin is sufficient to keep the
+                * inode alive, and we assume we have dev_pagemap pin
+                * otherwise we would not have a valid pfn_to_page()
+                * translation.
+                */
+               inode = mapping->host;
+               if (S_ISCHR(inode->i_mode)) {
+                       did_lock = true;
+                       break;
+               }
+
+               xa_lock_irq(&mapping->i_pages);
+               if (mapping != page->mapping) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       continue;
+               }
+               index = page->index;
+
+               entry = __get_unlocked_mapping_entry(mapping, index, &slot,
+                               entry_wait_revalidate);
+               if (!entry) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       break;
+               } else if (IS_ERR(entry)) {
+                       WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+                       continue;
+               }
+               lock_slot(mapping, slot);
+               did_lock = true;
+               xa_unlock_irq(&mapping->i_pages);
+               break;
+       }
+       rcu_read_unlock();
+
+       return did_lock;
+}
+
+void dax_unlock_mapping_entry(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+
+       if (S_ISCHR(inode->i_mode))
+               return;
+
+       unlock_mapping_entry(mapping, page->index);
+}
+
 /*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find radix tree entry at given index. If it is a DAX entry, return it
+ * with the radix tree entry locked. If the radix tree doesn't contain the
+ * given index, create an empty entry for the index and return with it locked.
  *
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error.  This error will
  * happen if there are any 4k entries within the 2MiB range that we are
  * requesting.
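A hedged sketch of how the new dax_lock_mapping_entry()/dax_unlock_mapping_entry() pair above is meant to be consumed; the caller, its name, and the error handling are assumptions for illustration (in the broader series the consumer is the memory-failure path, which needs page->mapping and page->index held stable while it walks reverse mappings):

	/* Hypothetical caller: pin a DAX page's file association first. */
	static int act_on_dax_page(struct page *page)
	{
		if (!dax_lock_mapping_entry(page))
			return -EBUSY;	/* not DAX, or mapping torn down */

		/*
		 * The radix tree entry is now locked (device-dax aside,
		 * where the dev_pagemap pin already suffices), so truncate
		 * cannot clear page->mapping/page->index underneath us.
		 */
		/* ... walk rmap, notify mappers, etc. ... */

		dax_unlock_mapping_entry(page);
		return 0;
	}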
@@ -400,13 +505,13 @@ restart:
        xa_lock_irq(&mapping->i_pages);
        entry = get_unlocked_mapping_entry(mapping, index, &slot);
 
-       if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+       if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
                entry = ERR_PTR(-EIO);
                goto out_unlock;
        }
 
        if (entry) {
-               if (size_flag & RADIX_DAX_PMD) {
+               if (size_flag & DAX_PMD) {
                        if (dax_is_pte_entry(entry)) {
                                put_unlocked_mapping_entry(mapping, index,
                                                entry);
@@ -477,7 +582,7 @@ restart:
                                        true);
                }
 
-               entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+               entry = dax_radix_locked_entry(0, size_flag | DAX_EMPTY);
 
                err = __radix_tree_insert(&mapping->i_pages, index,
                                dax_radix_order(entry), entry);
@@ -566,8 +671,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
                        if (index >= end)
                                break;
 
-                       if (WARN_ON_ONCE(
-                            !radix_tree_exceptional_entry(pvec_ent)))
+                       if (WARN_ON_ONCE(!xa_is_value(pvec_ent)))
                                continue;
 
                        xa_lock_irq(&mapping->i_pages);
@@ -606,7 +710,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
        xa_lock_irq(pages);
        entry = get_unlocked_mapping_entry(mapping, index, NULL);
-       if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+       if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                goto out;
        if (!trunc &&
            (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
@@ -622,8 +726,8 @@ out:
        return ret;
 }
 /*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping.  Wait for it
+ * to be unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
@@ -633,7 +737,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
         * This gets called from truncate / punch_hole path. As such, the caller
         * must hold locks protecting against concurrent modifications of the
         * radix tree (usually fs-private i_mmap_sem for writing). Since the
-        * caller has seen exceptional entry for this index, we better find it
+        * caller has seen a DAX entry for this index, we better find it
         * at that index as well...
         */
        WARN_ON_ONCE(!ret);
@@ -641,7 +745,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 }
 
 /*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
  */
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index)
@@ -655,7 +759,6 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 {
        void *vto, *kaddr;
        pgoff_t pgoff;
-       pfn_t pfn;
        long rc;
        int id;
 
@@ -664,7 +767,7 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
                return rc;
 
        id = dax_read_lock();
-       rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+       rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
        if (rc < 0) {
                dax_read_unlock(id);
                return rc;
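(The hunks in this region also switch dax_direct_access() callers to passing NULL for outputs they do not need; companion driver-side changes, not shown here, treat a NULL kaddr or pfn pointer as "not requested", which is why the dummy pfn_t and void * locals disappear.)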
@@ -696,7 +799,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
        if (dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-       if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+       if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
                /* we are replacing a zero page with block mapping */
                if (dax_is_pmd_entry(entry))
                        unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
@@ -709,7 +812,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
        new_entry = dax_radix_locked_entry(pfn, flags);
        if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
                dax_disassociate_entry(entry, mapping, false);
-               dax_associate_entry(new_entry, mapping);
+               dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
        }
 
        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
@@ -834,13 +937,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
         * A page got tagged dirty in DAX mapping? Something is seriously
         * wrong.
         */
-       if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+       if (WARN_ON(!xa_is_value(entry)))
                return -EIO;
 
        xa_lock_irq(pages);
        entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
        /* Entry got punched out / reallocated? */
-       if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
+       if (!entry2 || WARN_ON_ONCE(!xa_is_value(entry2)))
                goto put_unlocked;
        /*
         * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -975,7 +1078,6 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
 {
        const sector_t sector = dax_iomap_sector(iomap, pos);
        pgoff_t pgoff;
-       void *kaddr;
        int id, rc;
        long length;
 
@@ -984,7 +1086,7 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
                return rc;
        id = dax_read_lock();
        length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
-                                  &kaddr, pfnp);
+                                  NULL, pfnp);
        if (length < 0) {
                rc = length;
                goto out;
@@ -1015,21 +1117,13 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
 {
        struct inode *inode = mapping->host;
        unsigned long vaddr = vmf->address;
-       vm_fault_t ret = VM_FAULT_NOPAGE;
-       struct page *zero_page;
-       pfn_t pfn;
+       pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+       vm_fault_t ret;
 
-       zero_page = ZERO_PAGE(0);
-       if (unlikely(!zero_page)) {
-               ret = VM_FAULT_OOM;
-               goto out;
-       }
+       dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+                       DAX_ZERO_PAGE, false);
 
-       pfn = page_to_pfn_t(zero_page);
-       dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
-                       false);
        ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
-out:
        trace_dax_load_hole(inode, vmf, ret);
        return ret;
 }
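(Two simplifications in dax_load_hole() above: ZERO_PAGE(0) always exists and can never return NULL, so the VM_FAULT_OOM branch was dead code, and my_zero_pfn(vaddr) yields the zero-page pfn directly, including the cache-coloured variants some architectures keep.)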
@@ -1060,15 +1154,13 @@ int __dax_zero_page_range(struct block_device *bdev,
                pgoff_t pgoff;
                long rc, id;
                void *kaddr;
-               pfn_t pfn;
 
                rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
                if (rc)
                        return rc;
 
                id = dax_read_lock();
-               rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
-                               &pfn);
+               rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
                if (rc < 0) {
                        dax_read_unlock(id);
                        return rc;
@@ -1124,7 +1216,6 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                ssize_t map_len;
                pgoff_t pgoff;
                void *kaddr;
-               pfn_t pfn;
 
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
@@ -1136,7 +1227,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                        break;
 
                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-                               &kaddr, &pfn);
+                               &kaddr, NULL);
                if (map_len < 0) {
                        ret = map_len;
                        break;
@@ -1421,7 +1512,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 
        pfn = page_to_pfn_t(zero_page);
        ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-                       RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+                       DAX_PMD | DAX_ZERO_PAGE, false);
 
        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (!pmd_none(*(vmf->pmd))) {
@@ -1504,7 +1595,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
         * is already in the tree, for instance), it will return -EEXIST and
         * we just fall back to 4k entries.
         */
-       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+       entry = grab_mapping_entry(mapping, pgoff, DAX_PMD);
        if (IS_ERR(entry))
                goto fallback;
 
@@ -1542,7 +1633,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                        goto finish_iomap;
 
                entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-                                               RADIX_DAX_PMD, write && !sync);
+                                               DAX_PMD, write && !sync);
 
                /*
                 * If we are doing synchronous page fault and inode needs fsync,