mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory
author Jérôme Glisse <jglisse@redhat.com>
Fri, 8 Sep 2017 23:11:43 +0000 (16:11 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 Sep 2017 01:26:46 +0000 (18:26 -0700)
HMM (heterogeneous memory management) needs struct pages to support
migration from system main memory to device memory.  The reasons for HMM
and for migration to device memory are explained in the HMM core patch.

This patch deals with device memory that is un-addressable (i.e. the CPU
cannot access it).  Hence we do not want those struct pages to be managed
like regular memory.  That is why we extend ZONE_DEVICE to support
different types of memory.

A persistent memory type is defined for existing users of ZONE_DEVICE, and
a new device un-addressable type is added for the un-addressable memory.
There is a clear separation between what is expected from each memory type,
and existing users of ZONE_DEVICE are unaffected by the new requirements
and the new use of the un-addressable type.  All type-specific code paths
are protected by a test against the memory type.

Because the memory is un-addressable, we use a new special swap type for
when a page is migrated to device memory (this reduces the maximum number
of swap files).

The two main additions to ZONE_DEVICE, besides the memory type, are two
callbacks.  The first one, page_free(), is called whenever the page
refcount reaches 1 (which means the page is free, as a ZONE_DEVICE page
never reaches a refcount of 0).  This allows the device driver to manage
its memory and the associated struct pages.

The second callback, page_fault(), is called when there is a CPU access to
an address that is backed by a device page (which is un-addressable by the
CPU).  This callback is responsible for migrating the page back to system
main memory.  A device driver cannot block migration back to system memory;
HMM makes sure that such pages cannot be pinned in device memory.

If the device is in some error condition and cannot migrate memory back,
then a CPU page fault to device memory should end with SIGBUS.

[arnd@arndb.de: fix warning]
Link: http://lkml.kernel.org/r/20170823133213.712917-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/20170817000548.32038-8-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/proc/task_mmu.c
include/linux/ioport.h
include/linux/memremap.h
include/linux/mm.h
include/linux/swap.h
include/linux/swapops.h
kernel/memremap.c
mm/Kconfig
mm/memory.c
mm/memory_hotplug.c
mm/mprotect.c

index 4b21c4e51ce490830cf5553bbfb174bcd9f568eb..90ab657f8e5653ca2b284015a537e51e455afdb3 100644 (file)
@@ -549,6 +549,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
                        }
                } else if (is_migration_entry(swpent))
                        page = migration_entry_to_page(swpent);
+               else if (is_device_private_entry(swpent))
+                       page = device_private_entry_to_page(swpent);
        } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
                                                        && pte_none(*pte))) {
                page = find_get_entry(vma->vm_file->f_mapping,
@@ -713,6 +715,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 
                if (is_migration_entry(swpent))
                        page = migration_entry_to_page(swpent);
+               else if (is_device_private_entry(swpent))
+                       page = device_private_entry_to_page(swpent);
        }
        if (page) {
                int mapcount = page_mapcount(page);
@@ -1276,6 +1280,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
                flags |= PM_SWAP;
                if (is_migration_entry(entry))
                        page = migration_entry_to_page(entry);
+
+               if (is_device_private_entry(entry))
+                       page = device_private_entry_to_page(entry);
        }
 
        if (page && !PageAnon(page))
index 6230064d7f95d072d61e2bc0eedd341463880d32..3a4f69137bc2fa732dca27167573a9cec506dd2a 100644 (file)
@@ -130,6 +130,7 @@ enum {
        IORES_DESC_ACPI_NV_STORAGE              = 3,
        IORES_DESC_PERSISTENT_MEMORY            = 4,
        IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
+       IORES_DESC_DEVICE_PRIVATE_MEMORY        = 6,
 };
 
 /* helpers to define resources */
index 93416196ba64f6b4747ad9f37aaf5a913ff3ac29..8e164ec9eed0e34e8e1d6654fced5f9b64f97d5c 100644 (file)
@@ -4,6 +4,8 @@
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
 
+#include <asm/pgtable.h>
+
 struct resource;
 struct device;
 
@@ -35,18 +37,89 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 }
 #endif
 
+/*
+ * Specialize ZONE_DEVICE memory into multiple types, each having a different
+ * usage.
+ *
+ * MEMORY_DEVICE_HOST:
+ * Persistent device memory (pmem): struct page might be allocated in different
+ * memory and architecture might want to perform special actions. It is similar
+ * to regular memory, in that the CPU can access it transparently. However,
+ * it is likely to have different bandwidth and latency than regular memory.
+ * See Documentation/nvdimm/nvdimm.txt for more information.
+ *
+ * MEMORY_DEVICE_PRIVATE:
+ * Device memory that is not directly addressable by the CPU: CPU can neither
+ * read nor write private memory. In this case, we do still have struct pages
+ * backing the device memory. Doing so simplifies the implementation, but it is
+ * important to remember that there are certain points at which the struct page
+ * must be treated as an opaque object, rather than a "normal" struct page.
+ *
+ * A more complete discussion of unaddressable memory may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ */
+enum memory_type {
+       MEMORY_DEVICE_HOST = 0,
+       MEMORY_DEVICE_PRIVATE,
+};
+
+/*
+ * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two
+ * callbacks:
+ *   page_fault()
+ *   page_free()
+ *
+ * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
+ * explanation in include/linux/memory_hotplug.h.
+ *
+ * The page_fault() callback must migrate page back, from device memory to
+ * system memory, so that the CPU can access it. This might fail for various
+ * reasons (device issues,  device have been unplugged, ...). When such error
+ * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
+ * set the CPU page table entry to "poisoned".
+ *
+ * Note that because memory cgroup charges are transferred to the device memory,
+ * this should never fail due to memory restrictions. However, allocation
+ * of a regular system page might still fail because we are out of memory. If
+ * that happens, the page_fault() callback must return VM_FAULT_OOM.
+ *
+ * The page_fault() callback can also try to migrate back multiple pages in one
+ * chunk, as an optimization. It must, however, prioritize the faulting address
+ * over all the others.
+ *
+ *
+ * The page_free() callback is called once the page refcount reaches 1
+ * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug).
+ * This allows the device driver to implement its own memory management.
+ */
+typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+                               unsigned long addr,
+                               const struct page *page,
+                               unsigned int flags,
+                               pmd_t *pmdp);
+typedef void (*dev_page_free_t)(struct page *page, void *data);
+
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
+ * @page_fault: callback when CPU fault on an unaddressable device page
+ * @page_free: free page callback when page refcount reaches 1
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @dev: host device of the mapping for debug
+ * @data: private data pointer for page_free()
+ * @type: memory type: see enum memory_type (MEMORY_DEVICE_*) above
  */
 struct dev_pagemap {
+       dev_page_fault_t page_fault;
+       dev_page_free_t page_free;
        struct vmem_altmap *altmap;
        const struct resource *res;
        struct percpu_ref *ref;
        struct device *dev;
+       void *data;
+       enum memory_type type;
 };
 
 #ifdef CONFIG_ZONE_DEVICE
index 39db8e54c5d50a98ee9cf49eb5cf16e5271095a4..a74c4e95435223e835510d80764ff33e52b06dfd 100644 (file)
@@ -792,11 +792,23 @@ static inline bool is_zone_device_page(const struct page *page)
 {
        return page_zonenum(page) == ZONE_DEVICE;
 }
+
+static inline bool is_device_private_page(const struct page *page)
+{
+       /* See MEMORY_DEVICE_PRIVATE in include/linux/memremap.h */
+       return ((page_zonenum(page) == ZONE_DEVICE) &&
+               (page->pgmap->type == MEMORY_DEVICE_PRIVATE));
+}
 #else
 static inline bool is_zone_device_page(const struct page *page)
 {
        return false;
 }
+
+static inline bool is_device_private_page(const struct page *page)
+{
+       return false;
+}
 #endif
 
 static inline void get_page(struct page *page)
index 8bf3487fb2046b1a03e4d28fca762c35281cc219..8a807292037f9ff956980260ba61c0c4c22dd07f 100644 (file)
@@ -50,6 +50,23 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */
 
+/*
+ * Unaddressable device memory support. See include/linux/hmm.h and
+ * Documentation/vm/hmm.txt. Short description is we need struct pages for
+ * device memory that is unaddressable (inaccessible) by CPU, so that we can
+ * migrate part of a process memory to device memory.
+ *
+ * When a page is migrated from CPU to device, we set the CPU page table entry
+ * to a special SWP_DEVICE_* entry.
+ */
+#ifdef CONFIG_DEVICE_PRIVATE
+#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
+#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#else
+#define SWP_DEVICE_NUM 0
+#endif
+
 /*
  * NUMA node memory migration support
  */
@@ -72,7 +89,8 @@ static inline int current_is_kswapd(void)
 #endif
 
 #define MAX_SWAPFILES \
-       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+       ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
+       SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
@@ -469,8 +487,8 @@ static inline void show_swap_cache_info(void)
 {
 }
 
-#define free_swap_and_cache(swp)       is_migration_entry(swp)
-#define swapcache_prepare(swp)         is_migration_entry(swp)
+#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
 
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
index 45b092aa6419959c37ce0f7d57d64ba751a43c01..291c4b53465819293aaa9c25bbed1b932f449f1f 100644 (file)
@@ -100,6 +100,74 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
        return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+       return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
+                        page_to_pfn(page));
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+       int type = swp_type(entry);
+       return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+       *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+       return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+       return pfn_to_page(swp_offset(entry));
+}
+
+int device_private_entry_fault(struct vm_area_struct *vma,
+                      unsigned long addr,
+                      swp_entry_t entry,
+                      unsigned int flags,
+                      pmd_t *pmdp);
+#else /* CONFIG_DEVICE_PRIVATE */
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+       return swp_entry(0, 0);
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+       return false;
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+       return false;
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+       return NULL;
+}
+
+static inline int device_private_entry_fault(struct vm_area_struct *vma,
+                                    unsigned long addr,
+                                    swp_entry_t entry,
+                                    unsigned int flags,
+                                    pmd_t *pmdp)
+{
+       return VM_FAULT_SIGBUS;
+}
+#endif /* CONFIG_DEVICE_PRIVATE */
+
 #ifdef CONFIG_MIGRATION
 static inline swp_entry_t make_migration_entry(struct page *page, int write)
 {
index 066e73c2fcc9dbbd89c13923e203ed47aaff7d58..f1d1e0dfe8b4f693db64668c46873a56ab8995d7 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #ifndef ioremap_cache
 /* temporary while we convert existing ioremap_cache users to memremap */
@@ -219,6 +221,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff)
        for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
                        pgoff += 1UL << order, order = order_at((res), pgoff))
 
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+                      unsigned long addr,
+                      swp_entry_t entry,
+                      unsigned int flags,
+                      pmd_t *pmdp)
+{
+       struct page *page = device_private_entry_to_page(entry);
+
+       /*
+        * The page_fault() callback must migrate page back to system memory
+        * so that CPU can access it. This might fail for various reasons
+        * (device issue, device was unsafely unplugged, ...). When such
+        * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+        *
+        * Note that because memory cgroup charges are accounted to the device
+        * memory, this should never fail because of memory restrictions (but
+        * allocation of regular system page might still fail because we are
+        * out of memory).
+        *
+        * There is a more in-depth description of what that callback can and
+        * cannot do, in include/linux/memremap.h
+        */
+       return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
 static void pgmap_radix_release(struct resource *res)
 {
        unsigned long pgoff, order;
@@ -356,6 +386,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        }
        pgmap->ref = ref;
        pgmap->res = &page_map->res;
+       pgmap->type = MEMORY_DEVICE_HOST;
+       pgmap->page_fault = NULL;
+       pgmap->page_free = NULL;
+       pgmap->data = NULL;
 
        mutex_lock(&pgmap_lock);
        error = 0;
index 254db99f263dc929f6edc3848258a35794854934..ec27855db1339da1f24c616ee1d6750d0bc33d1e 100644 (file)
@@ -676,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE
        bool
 
 config ZONE_DEVICE
-       bool "Device memory (pmem, etc...) hotplug support"
+       bool "Device memory (pmem, HMM, etc...) hotplug support"
        depends on MEMORY_HOTPLUG
        depends on MEMORY_HOTREMOVE
        depends on SPARSEMEM_VMEMMAP
@@ -717,6 +717,15 @@ config HMM_MIRROR
          page tables (at PAGE_SIZE granularity), and must be able to recover from
          the resulting potential page faults.
 
+config DEVICE_PRIVATE
+       bool "Unaddressable device memory (GPU memory, ...)"
+       depends on ARCH_HAS_HMM
+
+       help
+         Allows creation of struct pages to represent unaddressable device
+         memory; i.e., memory that is only accessible from the device (or
+         group of devices). You likely also want to select HMM_MIRROR.
+
 config FRAME_VECTOR
        bool
 
index 886033b95fd2754f8d34a8e12c627bfd88d9839c..079eeac0b009790ac6c707eb65f05562f9ddcb86 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/memremap.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -956,6 +957,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                        pte = pte_swp_mksoft_dirty(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
+               } else if (is_device_private_entry(entry)) {
+                       page = device_private_entry_to_page(entry);
+
+                       /*
+                        * Update rss count even for unaddressable pages, as
+                        * they should be treated just like normal pages in this
+                        * respect.
+                        *
+                        * We will likely want to have some new rss counters
+                        * for unaddressable pages, at some point. But for now
+                        * keep things as they are.
+                        */
+                       get_page(page);
+                       rss[mm_counter(page)]++;
+                       page_dup_rmap(page, false);
+
+                       /*
+                        * We do not preserve soft-dirty information, because so
+                        * far, checkpoint/restore is the only feature that
+                        * requires that. And checkpoint/restore does not work
+                        * when a device driver is involved (you cannot easily
+                        * save and restore device driver state).
+                        */
+                       if (is_write_device_private_entry(entry) &&
+                           is_cow_mapping(vm_flags)) {
+                               make_device_private_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               set_pte_at(src_mm, addr, src_pte, pte);
+                       }
                }
                goto out_set_pte;
        }
@@ -1274,6 +1304,29 @@ again:
                        }
                        continue;
                }
+
+               entry = pte_to_swp_entry(ptent);
+               if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+                       struct page *page = device_private_entry_to_page(entry);
+
+                       if (unlikely(details && details->check_mapping)) {
+                               /*
+                                * unmap_shared_mapping_pages() wants to
+                                * invalidate cache without truncating:
+                                * unmap shared but keep private pages.
+                                */
+                               if (details->check_mapping !=
+                                   page_rmapping(page))
+                                       continue;
+                       }
+
+                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       rss[mm_counter(page)]--;
+                       page_remove_rmap(page, false);
+                       put_page(page);
+                       continue;
+               }
+
                /* If details->check_mapping, we leave swap entries. */
                if (unlikely(details))
                        continue;
@@ -2776,6 +2829,14 @@ int do_swap_page(struct vm_fault *vmf)
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_private_entry(entry)) {
+                       /*
+                        * For un-addressable device memory we call the pgmap
+                        * fault handler callback. The callback must migrate
+                        * the page back to some CPU accessible page.
+                        */
+                       ret = device_private_entry_fault(vma, vmf->address, entry,
+                                                vmf->flags, vmf->pmd);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
index 1f92fb84770d27d9adf4b3516db7f01f7fdf9c0a..e882cb6da99425bad30e4017c5feeadacba2e8ce 100644 (file)
@@ -99,7 +99,7 @@ void mem_hotplug_done(void)
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size)
 {
-       struct resource *res;
+       struct resource *res, *conflict;
        res = kzalloc(sizeof(struct resource), GFP_KERNEL);
        if (!res)
                return ERR_PTR(-ENOMEM);
@@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
        res->start = start;
        res->end = start + size - 1;
        res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-       if (request_resource(&iomem_resource, res) < 0) {
+       conflict =  request_resource_conflict(&iomem_resource, res);
+       if (conflict) {
+               if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+                       pr_debug("Device unaddressable memory block "
+                                "memory hotplug at %#010llx !\n",
+                                (unsigned long long)start);
+               }
                pr_debug("System RAM resource %pR cannot be added\n", res);
                kfree(res);
                return ERR_PTR(-EEXIST);
index a1bfe954577064149468e06e74191a1eeeeae6d0..6d3e2f0822901605aa3554234e3e96838997b0ba 100644 (file)
@@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
                                pages++;
                        }
+
+                       if (is_write_device_private_entry(entry)) {
+                               pte_t newpte;
+
+                               /*
+                                * We do not preserve soft-dirtiness. See
+                                * copy_one_pte() for explanation.
+                                */
+                               make_device_private_entry_read(&entry);
+                               newpte = swp_entry_to_pte(entry);
+                               set_pte_at(mm, addr, pte, newpte);
+
+                               pages++;
+                       }
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();